author     Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
commit     f382538d471e38a9b98f016c4caebd24c8d60b62 (patch)
tree       d30f3d58b1044b5355d50c17a6a96c6a0b35703a /test
parent     ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff)
Diffstat (limited to 'test')
-rw-r--r--  test/Analysis/CFLAliasAnalysis/Andersen/struct.ll | 18
-rw-r--r--  test/Bitcode/thinlto-function-summary-callgraph.ll | 13
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/localizer.mir | 49
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/select-pr32733.mir | 1
-rw-r--r--  test/CodeGen/AArch64/addcarry-crash.ll | 23
-rw-r--r--  test/CodeGen/AArch64/misched-fusion-aes.ll | 70
-rw-r--r--  test/CodeGen/AArch64/pr33172.ll | 32
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/merge-m0.mir | 1
-rw-r--r--  test/CodeGen/AMDGPU/sdwa-scalar-ops.mir | 2
-rw-r--r--  test/CodeGen/AMDGPU/waitcnt-permute.mir | 33
-rw-r--r--  test/CodeGen/ARM/cmpxchg-O0.ll | 9
-rw-r--r--  test/CodeGen/ARM/v6-jumptable-clobber.mir | 2
-rw-r--r--  test/CodeGen/AVR/rot.ll | 4
-rw-r--r--  test/CodeGen/Hexagon/invalid-dotnew-attempt.mir | 17
-rw-r--r--  test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll | 62
-rw-r--r--  test/CodeGen/Hexagon/mul64-sext.ll | 93
-rw-r--r--  test/CodeGen/MIR/Generic/multiRunPass.mir | 4
-rw-r--r--  test/CodeGen/Mips/compactbranches/empty-block.mir | 1
-rw-r--r--  test/CodeGen/PowerPC/expand-isel.ll | 15
-rw-r--r--  test/CodeGen/PowerPC/logic-ops-on-compares.ll | 130
-rw-r--r--  test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll | 121
-rw-r--r--  test/CodeGen/PowerPC/memcmp.ll | 87
-rw-r--r--  test/CodeGen/PowerPC/memcmpIR.ll | 194
-rw-r--r--  test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll | 49
-rw-r--r--  test/CodeGen/PowerPC/pristine-and-livein.mir | 330
-rw-r--r--  test/CodeGen/PowerPC/testComparesieqsll.ll | 134
-rw-r--r--  test/CodeGen/PowerPC/testComparesiequll.ll | 134
-rw-r--r--  test/CodeGen/PowerPC/testCompareslleqsll.ll | 133
-rw-r--r--  test/CodeGen/PowerPC/testComparesllequll.ll | 133
-rw-r--r--  test/CodeGen/PowerPC/vec_xxpermdi.ll | 307
-rw-r--r--  test/CodeGen/Thumb2/tbb-removeadd.mir | 1
-rw-r--r--  test/CodeGen/X86/2007-01-08-InstrSched.ll | 4
-rw-r--r--  test/CodeGen/X86/GlobalISel/irtranslator-call.ll | 1
-rw-r--r--  test/CodeGen/X86/add-of-carry.ll | 6
-rw-r--r--  test/CodeGen/X86/addcarry.ll | 21
-rw-r--r--  test/CodeGen/X86/avg.ll | 833
-rw-r--r--  test/CodeGen/X86/avx.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 6
-rw-r--r--  test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 10
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 44
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll | 83
-rw-r--r--  test/CodeGen/X86/avx512-mask-spills.ll | 40
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 12
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll | 16
-rw-r--r--  test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll | 24
-rw-r--r--  test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512cdvl-intrinsics.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512dq-intrinsics.ll | 4
-rw-r--r--  test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll | 10
-rw-r--r--  test/CodeGen/X86/avx512dqvl-intrinsics.ll | 4
-rw-r--r--  test/CodeGen/X86/avx512ifma-intrinsics.ll | 8
-rw-r--r--  test/CodeGen/X86/avx512ifmavl-intrinsics.ll | 16
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 64
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics.ll | 28
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-128.ll | 631
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-256.ll | 823
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-128.ll | 637
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-256.ll | 254
-rw-r--r--  test/CodeGen/X86/bswap_tree2.ll | 35
-rw-r--r--  test/CodeGen/X86/eh-unknown.ll | 32
-rw-r--r--  test/CodeGen/X86/fmsubadd-combine.ll | 8
-rw-r--r--  test/CodeGen/X86/fold-tied-op.ll | 7
-rw-r--r--  test/CodeGen/X86/fp128-i128.ll | 2
-rw-r--r--  test/CodeGen/X86/gnu-seh-nolpads.ll | 34
-rw-r--r--  test/CodeGen/X86/implicit-null-checks.mir | 22
-rw-r--r--  test/CodeGen/X86/lrshrink.ll | 57
-rw-r--r--  test/CodeGen/X86/madd.ll | 34
-rw-r--r--  test/CodeGen/X86/misched-matrix.ll | 4
-rw-r--r--  test/CodeGen/X86/mul-constant-i16.ll | 141
-rw-r--r--  test/CodeGen/X86/mul-constant-i32.ll | 1589
-rw-r--r--  test/CodeGen/X86/mul-constant-i64.ll | 1612
-rw-r--r--  test/CodeGen/X86/oddshuffles.ll | 34
-rw-r--r--  test/CodeGen/X86/pmul.ll | 55
-rw-r--r--  test/CodeGen/X86/pr32284.ll | 206
-rw-r--r--  test/CodeGen/X86/pr32610.ll | 40
-rw-r--r--  test/CodeGen/X86/rotate.ll | 16
-rw-r--r--  test/CodeGen/X86/sad.ll | 929
-rw-r--r--  test/CodeGen/X86/select.ll | 28
-rw-r--r--  test/CodeGen/X86/setcc-lowering.ll | 61
-rw-r--r--  test/CodeGen/X86/setcc-wide-types.ll | 56
-rw-r--r--  test/CodeGen/X86/shrink_vmul_sse.ll | 2
-rw-r--r--  test/CodeGen/X86/sse41.ll | 8
-rw-r--r--  test/CodeGen/X86/vector-bitreverse.ll | 6
-rw-r--r--  test/CodeGen/X86/vector-blend.ll | 4
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll | 14
-rw-r--r--  test/CodeGen/X86/xchg-nofold.ll | 24
-rw-r--r--  test/DebugInfo/MIR/X86/empty-inline.mir | 1
-rw-r--r--  test/DebugInfo/omit-empty.ll | 1
-rw-r--r--  test/Instrumentation/SanitizerCoverage/coverage-dbg.ll | 4
-rw-r--r--  test/Instrumentation/SanitizerCoverage/coverage.ll | 11
-rw-r--r--  test/Instrumentation/SanitizerCoverage/seh.ll | 1
-rw-r--r--  test/MC/AMDGPU/ds-err.s | 90
-rw-r--r--  test/MC/AMDGPU/ds.s | 144
-rw-r--r--  test/MC/ARM/big-endian-thumb-fixup.s | 1
-rw-r--r--  test/MC/ARM/mixed-arm-thumb-bl-fixup.ll | 77
-rw-r--r--  test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt | 20
-rw-r--r--  test/Other/new-pm-defaults.ll | 8
-rw-r--r--  test/Other/new-pm-thinlto-defaults.ll | 221
-rw-r--r--  test/ThinLTO/X86/newpm-basic.ll (renamed from test/ThinLTO/X86/error-newpm.ll) | 6
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineAlloca.ll | 68
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineAlloca2.ll | 65
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineAlloca4.ll | 67
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineAlloca5.ll | 67
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll | 61
-rw-r--r--  test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll | 62
-rw-r--r--  test/Transforms/GVN/PRE/phi-translate-2.ll | 105
-rw-r--r--  test/Transforms/GVN/PRE/pre-gep-load.ll | 2
-rw-r--r--  test/Transforms/GVN/PRE/pre-load.ll | 6
-rw-r--r--  test/Transforms/Inline/AArch64/gep-cost.ll | 25
-rw-r--r--  test/Transforms/InstCombine/ctpop.ll | 16
-rw-r--r--  test/Transforms/InstCombine/intrinsics.ll | 28
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll | 49
-rw-r--r--  test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll | 104
-rw-r--r--  test/Transforms/NewGVN/completeness.ll | 17
-rw-r--r--  test/Transforms/NewGVN/pr33185.ll | 59
-rw-r--r--  test/Transforms/PGOProfile/branch1.ll | 5
-rw-r--r--  test/Transforms/ThinLTOBitcodeWriter/new-pm.ll | 9
-rw-r--r--  test/Transforms/Util/PredicateInfo/condprop.ll | 7
-rw-r--r--  test/Transforms/Util/PredicateInfo/testandor.ll | 27
-rw-r--r--  test/tools/llvm-config/cflags.test | 2
-rw-r--r--  test/tools/llvm-cvtres/Inputs/test_resource.rc | 6
-rw-r--r--  test/tools/llvm-cvtres/Inputs/test_resource.res | bin 2200 -> 2332 bytes
-rw-r--r--  test/tools/llvm-cvtres/resource.test | 46
125 files changed, 5980 insertions, 6359 deletions
diff --git a/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll
new file mode 100644
index 0000000000000..c1d25c1e3c214
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll
@@ -0,0 +1,18 @@
+; Ensures that our struct ops are sane.
+
+; RUN: opt < %s -disable-basicaa -cfl-anders-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -aa-pipeline=cfl-anders-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; Since we ignore non-pointer values, we effectively ignore extractvalue
+; instructions. This means that %c "doesn't exist" in test_structure's graph,
+; so we currently get MayAlias.
+; XFAIL: *
+
+; CHECK-LABEL: Function: test_structure
+; CHECK: NoAlias: i64** %c, { i64**, i64** }* %a
+define void @test_structure() {
+ %a = alloca {i64**, i64**}, align 8
+ %b = load {i64**, i64**}, {i64**, i64**}* %a
+ %c = extractvalue {i64**, i64**} %b, 0
+ ret void
+}
diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll
index 8cc60ad633621..566f3a077e7bf 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph.ll
@@ -11,20 +11,23 @@
; RUN: llvm-lto -thinlto-index-stats %p/Inputs/thinlto-function-summary-callgraph-combined.1.bc | FileCheck %s --check-prefix=OLD-COMBINED
; CHECK: <SOURCE_FILENAME
+; CHECK-NEXT: <GLOBALVAR
; CHECK-NEXT: <FUNCTION
; "func"
-; CHECK-NEXT: <FUNCTION op0=4 op1=4
+; CHECK-NEXT: <FUNCTION op0=17 op1=4
; CHECK: <GLOBALVAL_SUMMARY_BLOCK
; CHECK-NEXT: <VERSION
; See if the call to func is registered.
-; CHECK-NEXT: <PERMODULE {{.*}} op4=1/>
+; CHECK-NEXT: <PERMODULE {{.*}} op3=1
; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
; CHECK: <STRTAB_BLOCK
-; CHECK-NEXT: blob data = 'mainfunc'
+; CHECK-NEXT: blob data = 'undefinedglobmainfunc'
; COMBINED: <GLOBALVAL_SUMMARY_BLOCK
; COMBINED-NEXT: <VERSION
+; Only 2 VALUE_GUID since reference to undefinedglob should not be included in
+; combined index.
; COMBINED-NEXT: <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/>
; COMBINED-NEXT: <VALUE_GUID
; COMBINED-NEXT: <COMBINED
@@ -40,10 +43,12 @@ target triple = "x86_64-unknown-linux-gnu"
define i32 @main() #0 {
entry:
call void (...) @func()
- ret i32 0
+ %u = load i32, i32* @undefinedglob
+ ret i32 %u
}
declare void @func(...) #1
+@undefinedglob = external global i32
; OLD: Index {{.*}} contains 1 nodes (1 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls)
; OLD-COMBINED: Index {{.*}} contains 2 nodes (2 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls)
diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir
index 8fbb2040157e7..5bf8dac79860c 100644
--- a/test/CodeGen/AArch64/GlobalISel/localizer.mir
+++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir
@@ -12,6 +12,7 @@
define void @non_local_phi_use_followed_by_use() { ret void }
define void @non_local_phi_use_followed_by_use_fi() { ret void }
define void @float_non_local_phi_use_followed_by_use_fi() { ret void }
+ define void @non_local_phi() { ret void }
...
---
@@ -310,3 +311,51 @@ body: |
%3(s32) = PHI %0(s32), %bb.1
%2(s32) = G_FADD %3, %0
...
+
+---
+# Make sure we don't insert a constant before PHIs.
+# This used to happen for loops of one basic block.
+# CHECK-LABEL: name: non_local_phi
+name: non_local_phi
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK: registers:
+# Existing registers should be left untouched
+# CHECK: - { id: 0, class: fpr }
+#CHECK-NEXT: - { id: 1, class: fpr }
+#CHECK-NEXT: - { id: 2, class: fpr }
+#CHECK-NEXT: - { id: 3, class: fpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT: - { id: 4, class: fpr }
+
+registers:
+ - { id: 0, class: fpr }
+ - { id: 1, class: fpr }
+ - { id: 2, class: fpr }
+ - { id: 3, class: fpr }
+
+# CHECK: body:
+# CHECK: %0(s32) = G_FCONSTANT float 1.0
+# CHECK-NEXT: %1(s32) = G_FADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %3(s32) = PHI %1(s32), %bb.0, %4(s32), %bb.1
+# CHECK: %4(s32) = G_FCONSTANT float 1.0
+
+# CHECK-NEXT: %2(s32) = G_FADD %3, %1
+body: |
+ bb.0:
+ successors: %bb.1
+
+ %0(s32) = G_FCONSTANT float 1.0
+ %1(s32) = G_FADD %0, %0
+
+ bb.1:
+ successors: %bb.1
+
+ %3(s32) = PHI %1(s32), %bb.0, %0(s32), %bb.1
+ %2(s32) = G_FADD %3, %1
+ G_BR %bb.1
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir
index 96436209451b0..c35d1719f84c8 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir
@@ -13,7 +13,6 @@
name: main
alignment: 2
exposesReturnsTwice: false
-noVRegs: false
legalized: true
regBankSelected: true
selected: false
diff --git a/test/CodeGen/AArch64/addcarry-crash.ll b/test/CodeGen/AArch64/addcarry-crash.ll
new file mode 100644
index 0000000000000..ba833e0b5873c
--- /dev/null
+++ b/test/CodeGen/AArch64/addcarry-crash.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-apple-ios7.0"
+
+define i64 @foo(i64* nocapture readonly %ptr, i64 %a, i64 %b, i64 %c) local_unnamed_addr #0 {
+; CHECK: ldr w8, [x0, #4]
+; CHECK: lsr x9, x1, #32
+; CHECK: cmn x3, x2
+; CHECK: mul x8, x8, x9
+; CHECK: cinc x0, x8, hs
+; CHECK: ret
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = load i64, i64* %ptr, align 8
+ %2 = lshr i64 %1, 32
+ %3 = mul nuw i64 %2, %0
+ %4 = add i64 %c, %b
+ %5 = icmp ult i64 %4, %c
+ %6 = zext i1 %5 to i64
+ %7 = add i64 %3, %6
+ ret i64 %7
+}
+
+attributes #0 = { norecurse nounwind readonly }
diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll
index 1d8787212579a..bd7c69c910c0e 100644
--- a/test/CodeGen/AArch64/misched-fusion-aes.ll
+++ b/test/CodeGen/AArch64/misched-fusion-aes.ll
@@ -1,5 +1,7 @@
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
@@ -72,22 +74,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
ret void
; CHECK-LABEL: aesea:
-; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
-; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
-; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
-; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
-; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
-; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
-; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
-; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKCORTEX: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKCORTEX: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKCORTEX: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKCORTEX: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKCORTEX: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKCORTEX: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKCORTEX: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKCORTEX: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
@@ -173,22 +175,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
ret void
; CHECK-LABEL: aesda:
-; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
-; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
-; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
-; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
-; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
-; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
-; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
-; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKCORTEX: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKCORTEX: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKCORTEX: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKCORTEX: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKCORTEX: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKCORTEX: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKCORTEX: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKCORTEX: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
diff --git a/test/CodeGen/AArch64/pr33172.ll b/test/CodeGen/AArch64/pr33172.ll
new file mode 100644
index 0000000000000..1e1da78b28ff4
--- /dev/null
+++ b/test/CodeGen/AArch64/pr33172.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s | FileCheck %s
+
+; CHECK-LABEL: pr33172
+; CHECK: ldp
+; CHECK: stp
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios10.3.0"
+
+@main.b = external global [200 x float], align 8
+@main.x = external global [200 x float], align 8
+
+; Function Attrs: nounwind ssp
+define void @pr33172() local_unnamed_addr {
+entry:
+ %wide.load8281058.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 12) to i64*), align 8
+ %wide.load8291059.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 14) to i64*), align 8
+ store i64 %wide.load8281058.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 12) to i64*), align 8
+ store i64 %wide.load8291059.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 14) to i64*), align 8
+ %wide.load8281058.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 16) to i64*), align 8
+ %wide.load8291059.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 18) to i64*), align 8
+ store i64 %wide.load8281058.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 16) to i64*), align 8
+ store i64 %wide.load8291059.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 18) to i64*), align 8
+ tail call void @llvm.memset.p0i8.i64(i8* bitcast ([200 x float]* @main.b to i8*), i8 0, i64 undef, i32 8, i1 false) #2
+ unreachable
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1
+
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index a3a78d326a628..02642142ae2cd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
; FUNC-LABEL: {{^}}ds_swizzle:
-; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
+; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11")
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
%swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir
index 064db49924e15..720642ad1ddb9 100644
--- a/test/CodeGen/AMDGPU/merge-m0.mir
+++ b/test/CodeGen/AMDGPU/merge-m0.mir
@@ -50,7 +50,6 @@
name: test
alignment: 0
exposesReturnsTwice: false
-noVRegs: false
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
index cd50e01032c38..cd0d410368c7d 100644
--- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
+++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@@ -86,7 +86,6 @@
name: sdwa_imm_operand
alignment: 0
exposesReturnsTwice: false
-noVRegs: false
legalized: false
regBankSelected: false
selected: false
@@ -248,7 +247,6 @@ body: |
name: sdwa_sgpr_operand
alignment: 0
exposesReturnsTwice: false
-noVRegs: false
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir
new file mode 100644
index 0000000000000..44dbd38f2d300
--- /dev/null
+++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir
@@ -0,0 +1,33 @@
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
+
+--- |
+ define float @waitcnt-permute(i32 %x, i32 %y) {
+ entry:
+ %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y)
+ %1 = bitcast i32 %0 to float
+ %2 = fadd float 1.000000e+00, %1
+ ret float %2
+ }
+
+ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32)
+
+...
+---
+# CHECK-LABEL: name: waitcnt-permute{{$}}
+# CHECK: DS_BPERMUTE_B32
+# CHECK-NEXT: S_WAITCNT 127
+
+name: waitcnt-permute
+liveins:
+ - { reg: '%vgpr0' }
+ - { reg: '%vgpr1' }
+ - { reg: '%sgpr30_sgpr31' }
+body: |
+ bb.0:
+ liveins: %vgpr0, %vgpr1, %sgpr30_sgpr31
+
+ %vgpr0 = DS_BPERMUTE_B32 killed %vgpr0, killed %vgpr1, 0, implicit %exec
+ %vgpr0 = V_ADD_F32_e32 1065353216, killed %vgpr0, implicit %exec
+ S_SETPC_B64_return killed %sgpr30_sgpr31, implicit killed %vgpr0
+
+...
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll
index f8ad2bbbbe0e4..a3be72112c761 100644
--- a/test/CodeGen/ARM/cmpxchg-O0.ll
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@@ -10,10 +10,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
; CHECK: dmb ish
; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]]
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrexb [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: strexb [[STATUS]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
@@ -29,10 +30,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind
; CHECK: dmb ish
; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]]
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrexh [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: strexh [[STATUS]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
@@ -48,10 +50,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind
; CHECK: dmb ish
; CHECK-NOT: uxt
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrex [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: strex [[STATUS]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
diff --git a/test/CodeGen/ARM/v6-jumptable-clobber.mir b/test/CodeGen/ARM/v6-jumptable-clobber.mir
index 0e9bc42565f3b..6577ef8486713 100644
--- a/test/CodeGen/ARM/v6-jumptable-clobber.mir
+++ b/test/CodeGen/ARM/v6-jumptable-clobber.mir
@@ -190,7 +190,6 @@
name: foo
alignment: 1
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
@@ -289,7 +288,6 @@ body: |
name: bar
alignment: 1
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/AVR/rot.ll b/test/CodeGen/AVR/rot.ll
index e43daf3e6aa85..a7b77d97ba698 100644
--- a/test/CodeGen/AVR/rot.ll
+++ b/test/CodeGen/AVR/rot.ll
@@ -6,7 +6,7 @@
define i8 @rol8(i8 %val, i8 %amt) {
; CHECK: andi r22, 7
- ; CHECK-NEXT: cp r22, r0
+ ; CHECK-NEXT: cpi r22, 0
; CHECK-NEXT: breq LBB0_2
; CHECK-NEXT: LBB0_1:
@@ -32,7 +32,7 @@ define i8 @rol8(i8 %val, i8 %amt) {
define i8 @ror8(i8 %val, i8 %amt) {
; CHECK: andi r22, 7
- ; CHECK-NEXT: cp r22, r0
+ ; CHECK-NEXT: cpi r22, 0
; CHECK-NEXT: breq LBB1_2
; CHECK-NEXT: LBB1_1:
diff --git a/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir
new file mode 100644
index 0000000000000..2233e3289f112
--- /dev/null
+++ b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir
@@ -0,0 +1,17 @@
+# RUN: llc -march=hexagon -start-after if-converter %s -o - | FileCheck %s
+# CHECK: p0 = r0
+# CHECK-NEXT: jumpr r31
+
+# Make sure that the packetizer does not attempt to newify the J2_jumpr
+# only because of the def-use of p0.
+
+---
+name: fred
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: %d0
+ %p0 = C2_tfrrp %r0
+ J2_jumpr %r31, implicit-def %pc, implicit %p0
+...
+
diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll
new file mode 100644
index 0000000000000..b25010f2a90fe
--- /dev/null
+++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll
@@ -0,0 +1,62 @@
+; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s
+;
+; The number of nested selects caused the simplification loop to take
+; more than the maximum number of iterations. This caused the compiler
+; to crash under suspicion of an infinite loop. This (still reduced)
+; testcase shows a legitimate case where this limit was exceeded.
+; Instead of crashing, gracefully abort the simplification.
+;
+; Check for sane output.
+; CHECK: define void @fred
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define void @fred() unnamed_addr #0 {
+b0:
+ %v1 = select i1 false, i32 undef, i32 2
+ br label %b2
+
+b2: ; preds = %b2, %b0
+ %v3 = sext i16 undef to i32
+ %v4 = add nsw i32 %v1, %v3
+ %v5 = select i1 undef, i32 undef, i32 %v4
+ %v6 = icmp slt i32 %v5, undef
+ %v7 = select i1 %v6, i32 %v5, i32 undef
+ %v8 = icmp slt i32 %v7, 0
+ %v9 = select i1 %v8, i32 %v7, i32 0
+ %v10 = sub i32 undef, undef
+ %v11 = add i32 %v10, %v9
+ %v12 = sext i16 undef to i32
+ %v13 = sext i16 undef to i32
+ %v14 = add nsw i32 %v1, %v13
+ %v15 = select i1 undef, i32 undef, i32 %v14
+ %v16 = icmp slt i32 %v15, undef
+ %v17 = select i1 %v16, i32 %v15, i32 undef
+ %v18 = select i1 undef, i32 %v17, i32 %v12
+ %v19 = add i32 undef, %v18
+ %v20 = sext i16 undef to i32
+ %v21 = sext i16 0 to i32
+ %v22 = add nsw i32 %v1, %v21
+ %v23 = sext i16 undef to i32
+ %v24 = add nsw i32 %v1, %v23
+ %v25 = select i1 undef, i32 undef, i32 %v24
+ %v26 = icmp slt i32 %v25, %v22
+ %v27 = select i1 %v26, i32 %v25, i32 %v22
+ %v28 = icmp slt i32 %v27, %v20
+ %v29 = select i1 %v28, i32 %v27, i32 %v20
+ %v30 = add i32 undef, %v29
+ %v31 = add i32 %v11, undef
+ %v32 = add i32 %v31, undef
+ %v33 = add i32 %v32, %v19
+ %v34 = add i32 %v33, %v30
+ %v35 = add nsw i32 %v34, 32768
+ %v36 = icmp ult i32 %v35, 65536
+ %v37 = select i1 %v36, i32 %v34, i32 undef
+ br i1 undef, label %b2, label %b38
+
+b38: ; preds = %b2
+ unreachable
+}
+
+attributes #0 = { "target-cpu"="hexagonv60" }
diff --git a/test/CodeGen/Hexagon/mul64-sext.ll b/test/CodeGen/Hexagon/mul64-sext.ll
new file mode 100644
index 0000000000000..8bbe6649a1fbc
--- /dev/null
+++ b/test/CodeGen/Hexagon/mul64-sext.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target triple = "hexagon-unknown--elf"
+
+; CHECK-LABEL: mul_1
+; CHECK: r1:0 = mpy(r2,r0)
+define i64 @mul_1(i64 %a0, i64 %a1) #0 {
+b2:
+ %v3 = shl i64 %a0, 32
+ %v4 = ashr exact i64 %v3, 32
+ %v5 = shl i64 %a1, 32
+ %v6 = ashr exact i64 %v5, 32
+ %v7 = mul nsw i64 %v6, %v4
+ ret i64 %v7
+}
+
+; CHECK-LABEL: mul_2
+; CHECK: r0 = memb(r0+#0)
+; CHECK: r1:0 = mpy(r2,r0)
+; CHECK: jumpr r31
+define i64 @mul_2(i8* %a0, i64 %a1) #0 {
+b2:
+ %v3 = load i8, i8* %a0
+ %v4 = sext i8 %v3 to i64
+ %v5 = shl i64 %a1, 32
+ %v6 = ashr exact i64 %v5, 32
+ %v7 = mul nsw i64 %v6, %v4
+ ret i64 %v7
+}
+
+; CHECK-LABEL: mul_acc_1
+; CHECK: r5:4 += mpy(r2,r0)
+; CHECK: r1:0 = combine(r5,r4)
+; CHECK: jumpr r31
+define i64 @mul_acc_1(i64 %a0, i64 %a1, i64 %a2) #0 {
+b3:
+ %v4 = shl i64 %a0, 32
+ %v5 = ashr exact i64 %v4, 32
+ %v6 = shl i64 %a1, 32
+ %v7 = ashr exact i64 %v6, 32
+ %v8 = mul nsw i64 %v7, %v5
+ %v9 = add i64 %a2, %v8
+ ret i64 %v9
+}
+
+; CHECK-LABEL: mul_acc_2
+; CHECK: r2 = memw(r2+#0)
+; CHECK: r5:4 += mpy(r2,r0)
+; CHECK: r1:0 = combine(r5,r4)
+; CHECK: jumpr r31
+define i64 @mul_acc_2(i64 %a0, i32* %a1, i64 %a2) #0 {
+b3:
+ %v4 = shl i64 %a0, 32
+ %v5 = ashr exact i64 %v4, 32
+ %v6 = load i32, i32* %a1
+ %v7 = sext i32 %v6 to i64
+ %v8 = mul nsw i64 %v7, %v5
+ %v9 = add i64 %a2, %v8
+ ret i64 %v9
+}
+
+; CHECK-LABEL: mul_nac_1
+; CHECK: r5:4 -= mpy(r2,r0)
+; CHECK: r1:0 = combine(r5,r4)
+; CHECK: jumpr r31
+define i64 @mul_nac_1(i64 %a0, i64 %a1, i64 %a2) #0 {
+b3:
+ %v4 = shl i64 %a0, 32
+ %v5 = ashr exact i64 %v4, 32
+ %v6 = shl i64 %a1, 32
+ %v7 = ashr exact i64 %v6, 32
+ %v8 = mul nsw i64 %v7, %v5
+ %v9 = sub i64 %a2, %v8
+ ret i64 %v9
+}
+
+; CHECK-LABEL: mul_nac_2
+; CHECK: r0 = memw(r0+#0)
+; CHECK: r5:4 -= mpy(r2,r0)
+; CHECK: r1:0 = combine(r5,r4)
+; CHECK: jumpr r31
+define i64 @mul_nac_2(i32* %a0, i64 %a1, i64 %a2) #0 {
+b3:
+ %v4 = load i32, i32* %a0
+ %v5 = sext i32 %v4 to i64
+ %v6 = shl i64 %a1, 32
+ %v7 = ashr exact i64 %v6, 32
+ %v8 = mul nsw i64 %v7, %v5
+ %v9 = sub i64 %a2, %v8
+ ret i64 %v9
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MIR/Generic/multiRunPass.mir b/test/CodeGen/MIR/Generic/multiRunPass.mir
index bca007de80b7c..bd1c0d0b458e5 100644
--- a/test/CodeGen/MIR/Generic/multiRunPass.mir
+++ b/test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -7,8 +7,8 @@
# This test ensures that the command line accepts
# several run passes on the same command line and
# actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt
-# PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt
+# PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos
# Make sure there are no other passes happening after what we asked.
# CHECK-NEXT: --- |
diff --git a/test/CodeGen/Mips/compactbranches/empty-block.mir b/test/CodeGen/Mips/compactbranches/empty-block.mir
index 7831e51e31579..7fb1afae91210 100644
--- a/test/CodeGen/Mips/compactbranches/empty-block.mir
+++ b/test/CodeGen/Mips/compactbranches/empty-block.mir
@@ -39,7 +39,6 @@
name: l5
alignment: 2
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/PowerPC/expand-isel.ll b/test/CodeGen/PowerPC/expand-isel.ll
index 553cc3c372e5b..c8707bda8e84a 100644
--- a/test/CodeGen/PowerPC/expand-isel.ll
+++ b/test/CodeGen/PowerPC/expand-isel.ll
@@ -212,13 +212,14 @@ cleanup:
ret i32 %retval.0
; CHECK-LABEL: @testComplexISEL
-; CHECK: bc 12, 2, [[TRUE:.LBB[0-9]+]]
-; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
-; CHECK-NEXT: [[TRUE]]
-; CHECK-NEXT: addi r3, r12, 0
-; CHECK-NEXT: [[SUCCESSOR]]
-; CHECK-NEXT: clrldi r3, r3, 32
-; CHECK-NEXT: blr
+; CHECK-DAG: [[LI:r[0-9]+]], 1
+; CHECK-DAG: cmplwi [[LD:r[0-9]+]], 0
+; CHECK: beq cr0, [[EQ:.LBB[0-9_]+]]
+; CHECK: blr
+; CHECK: [[EQ]]
+; CHECK: xor [[XOR:r[0-9]+]]
+; CHECK: cntlzd [[CZ:r[0-9]+]], [[XOR]]
+; CHECK: rldicl [[SH:r[0-9]+]], [[CZ]], 58, 63
}
!1 = !{!2, !2, i64 0}
diff --git a/test/CodeGen/PowerPC/logic-ops-on-compares.ll b/test/CodeGen/PowerPC/logic-ops-on-compares.ll
new file mode 100644
index 0000000000000..df021c20ea86e
--- /dev/null
+++ b/test/CodeGen/PowerPC/logic-ops-on-compares.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+; Function Attrs: nounwind
+define signext i32 @logic_ne_32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; CHECK-LABEL: logic_ne_32:
+; CHECK: xor r7, r3, r4
+; CHECK-NEXT: li r6, 55
+; CHECK-NEXT: xor r5, r5, r6
+; CHECK-NEXT: or r7, r7, r4
+; CHECK-NEXT: cntlzw r5, r5
+; CHECK-NEXT: cntlzw r6, r7
+; CHECK-NEXT: srwi r6, r6, 5
+; CHECK-NEXT: srwi r5, r5, 5
+; CHECK-NEXT: or. r5, r6, r5
+; CHECK-NEXT: bc 4, 1
+entry:
+ %tobool = icmp eq i32 %a, %b
+ %tobool1 = icmp eq i32 %b, 0
+ %or.cond = and i1 %tobool, %tobool1
+ %tobool3 = icmp eq i32 %c, 55
+ %or.cond5 = or i1 %or.cond, %tobool3
+ br i1 %or.cond5, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = tail call signext i32 @foo(i32 signext %a) #2
+ br label %return
+
+if.end: ; preds = %entry
+ %call4 = tail call signext i32 @bar(i32 signext %b) #2
+ br label %return
+
+return: ; preds = %if.end, %if.then
+ %retval.0 = phi i32 [ %call4, %if.end ], [ %call, %if.then ]
+ ret i32 %retval.0
+}
+
+define void @neg_truncate_i32(i32 *%ptr) {
+; CHECK-LABEL: neg_truncate_i32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lwz r3, 0(r3)
+; CHECK-NEXT: rldicl. r3, r3, 0, 63
+; CHECK-NEXT: bclr 12, 2, 0
+; CHECK-NEXT: # BB#1: # %if.end29.thread136
+; CHECK-NEXT: .LBB1_2: # %if.end29
+entry:
+ %0 = load i32, i32* %ptr, align 4
+ %rem17127 = and i32 %0, 1
+ %cmp18 = icmp eq i32 %rem17127, 0
+ br label %if.else
+
+if.else: ; preds = %entry
+ br i1 %cmp18, label %if.end29, label %if.end29.thread136
+
+if.end29.thread136: ; preds = %if.else
+ unreachable
+
+if.end29: ; preds = %if.else
+ ret void
+
+}
+
+; Function Attrs: nounwind
+define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: logic_ne_64:
+; CHECK: xor r7, r3, r4
+; CHECK-NEXT: li r6, 55
+; CHECK-NEXT: xor r5, r5, r6
+; CHECK-NEXT: or r7, r7, r4
+; CHECK-NEXT: cntlzd r6, r7
+; CHECK-NEXT: cntlzd r5, r5
+; CHECK-NEXT: rldicl r6, r6, 58, 63
+; CHECK-NEXT: rldicl r5, r5, 58, 63
+; CHECK-NEXT: or. r5, r6, r5
+; CHECK-NEXT: bc 4, 1
+entry:
+ %tobool = icmp eq i64 %a, %b
+ %tobool1 = icmp eq i64 %b, 0
+ %or.cond = and i1 %tobool, %tobool1
+ %tobool3 = icmp eq i64 %c, 55
+ %or.cond5 = or i1 %or.cond, %tobool3
+ br i1 %or.cond5, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = tail call i64 @foo64(i64 %a) #2
+ br label %return
+
+if.end: ; preds = %entry
+ %call4 = tail call i64 @bar64(i64 %b) #2
+ br label %return
+
+return: ; preds = %if.end, %if.then
+ %retval.0 = phi i64 [ %call4, %if.end ], [ %call, %if.then ]
+ ret i64 %retval.0
+}
+
+define void @neg_truncate_i64(i64 *%ptr) {
+; CHECK-LABEL: neg_truncate_i64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: ld r3, 0(r3)
+; CHECK-NEXT: rldicl. r3, r3, 0, 63
+; CHECK-NEXT: bclr 12, 2, 0
+; CHECK-NEXT: # BB#1: # %if.end29.thread136
+; CHECK-NEXT: .LBB3_2: # %if.end29
+entry:
+ %0 = load i64, i64* %ptr, align 4
+ %rem17127 = and i64 %0, 1
+ %cmp18 = icmp eq i64 %rem17127, 0
+ br label %if.else
+
+if.else: ; preds = %entry
+ br i1 %cmp18, label %if.end29, label %if.end29.thread136
+
+if.end29.thread136: ; preds = %if.else
+ unreachable
+
+if.end29: ; preds = %if.else
+ ret void
+
+}
+
+declare signext i32 @foo(i32 signext)
+declare signext i32 @bar(i32 signext)
+declare i64 @foo64(i64)
+declare i64 @bar64(i64)
diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
new file mode 100644
index 0000000000000..3095429758f64
--- /dev/null
+++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -0,0 +1,121 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+; Validate with if(memcmp())
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16)
+ %not.tobool = icmp ne i32 %call, 0
+ %. = zext i1 %not.tobool to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest01
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
+
+; Validate with if(memcmp() == 0)
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+ %not.cmp = icmp ne i32 %call, 0
+ %. = zext i1 %not.cmp to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest02
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
+
+; Validate with > 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+ %not.cmp = icmp slt i32 %call, 1
+ %. = zext i1 %not.cmp to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest03
+ ; CHECK-LABEL: %res_block
+ ; CHECK: cmpld
+ ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+ ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+ ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with < 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
+ %call.lobit = lshr i32 %call, 31
+ %call.lobit.not = xor i32 %call.lobit, 1
+ ret i32 %call.lobit.not
+
+ ; CHECK-LABEL: @zeroEqualityTest04
+ ; CHECK-LABEL: %res_block
+ ; CHECK: cmpld
+ ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+ ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+ ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+ %not.tobool = icmp eq i32 %call, 0
+ %cond = zext i1 %not.tobool to i32
+ ret i32 %cond
+
+ ; CHECK-LABEL: @zeroEqualityTest05
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK: li 3, 0
+}
+
+; Validate with !memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+ %not.lnot = icmp ne i32 %call, 0
+ %cond = zext i1 %not.lnot to i32
+ ret i32 %cond
+
+ ; CHECK-LABEL: @zeroEqualityTest06
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
diff --git a/test/CodeGen/PowerPC/memcmp.ll b/test/CodeGen/PowerPC/memcmp.ll
new file mode 100644
index 0000000000000..bae713cb2072c
--- /dev/null
+++ b/test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
+
+; Check size 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ldbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 4
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 2
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test3
+; CHECK: lhbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 1
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test4
+; CHECK: lbz [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lbz [[LOAD2:[0-9]+]]
+; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: extsw 3, [[SUB]]
+; CHECK-NEXT: blr
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
diff --git a/test/CodeGen/PowerPC/memcmpIR.ll b/test/CodeGen/PowerPC/memcmpIR.ll
new file mode 100644
index 0000000000000..f052cc258df8d
--- /dev/null
+++ b/test/CodeGen/PowerPC/memcmpIR.ll
@@ -0,0 +1,194 @@
+; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
+; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
+
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+entry:
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+ ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+ ret i32 %call
+}
+
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
+ ret i32 %call
+}
+
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+ ret i32 %call
+}
+ ; CHECK: call = tail call signext i32 @memcmp
+ ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
+ ret i32 %call
+}
+
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) {
+ ; CHECK: call = tail call signext i32 @memcmp
+ ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %conv = sext i32 %SIZE to i64
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
+ ret i32 %call
+}
diff --git a/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll
new file mode 100644
index 0000000000000..7ca5332865caa
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT
+; RUN: llc < %s -march=ppc64 -mcpu=a2 -enable-ppc-prefetching=true | FileCheck %s -check-prefix=CHECK-DCBT
+
+; Function Attrs: nounwind
+define signext i32 @check_cache_line() local_unnamed_addr {
+entry:
+ %call = tail call i32* bitcast (i32* (...)* @magici to i32* ()*)()
+ %call115 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)()
+ %cmp16 = icmp sgt i32 %call115, 0
+ br i1 %cmp16, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %res.017 = phi i32 [ %add5, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %call, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %res.017
+ %1 = add nuw nsw i64 %indvars.iv, 16
+ %arrayidx4 = getelementptr inbounds i32, i32* %call, i64 %1
+ %2 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %add, %2
+ %indvars.iv.next = add nuw i64 %indvars.iv, 1
+ %call1 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)()
+ %3 = sext i32 %call1 to i64
+ %cmp = icmp slt i64 %indvars.iv.next, %3
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+; CHECK-LABEL: check_cache_line
+; CHECK: dcbt
+; CHECK-NOT: dcbt
+; CHECK: blr
+; CHECK-DCBT-LABEL: check_cache_line
+; CHECK-DCBT: dcbt
+; CHECK-DCBT: dcbt
+; CHECK-DCBT: blr
+}
+
+declare i32* @magici(...) local_unnamed_addr
+
+declare signext i32 @iter(...) local_unnamed_addr
+
diff --git a/test/CodeGen/PowerPC/pristine-and-livein.mir b/test/CodeGen/PowerPC/pristine-and-livein.mir
deleted file mode 100644
index 6d93bb68c102c..0000000000000
--- a/test/CodeGen/PowerPC/pristine-and-livein.mir
+++ /dev/null
@@ -1,330 +0,0 @@
-# RUN: llc -run-pass=post-RA-sched %s -o - | FileCheck %s
-
-# CHECK: callee-saved-register: '[[REG:%x[0-9]+]]'
-# CHECK: callee-saved-register: '{{%x[0-9]+}}'
-# CHECK-NOT: [[REG]] = LI8 0
-# CHECK: STD killed [[REG]],
---- |
- ; ModuleID = '<stdin>'
- source_filename = "bugpoint-output-4d91ae2.bc"
- target datalayout = "e-m:e-i64:64-n32:64"
- target triple = "powerpc64le--linux-gnu"
-
- ; Function Attrs: norecurse nounwind readonly
- define i64 @adler32_z(i64 %adler, i8* readonly %buf, i64 %len) local_unnamed_addr #0 {
- entry:
- %shr = lshr i64 %adler, 16
- %and = and i64 %shr, 65535
- %and1 = and i64 %adler, 65535
- br i1 undef, label %if.then, label %if.end15
-
- if.then: ; preds = %entry
- %add5 = add nsw i64 %and1, %and
- %sub9 = add nsw i64 %add5, 281474976645135
- %shl = shl i64 %add5, 16
- %or = or i64 %shl, %and1
- br label %cleanup
-
- if.end15: ; preds = %entry
- br i1 undef, label %while.cond.preheader, label %while.cond30.preheader
-
- while.cond30.preheader: ; preds = %if.end15
- br i1 undef, label %while.body33.preheader, label %while.body109.preheader
-
- while.body33.preheader: ; preds = %while.cond30.preheader
- br label %while.body33
-
- while.cond.preheader: ; preds = %if.end15
- %sub25 = add i64 %and1, -65521
- %rem = urem i64 %and, 65521
- %shl27 = shl nuw nsw i64 %rem, 16
- %or28 = or i64 %shl27, %and1
- br label %cleanup
-
- while.body33: ; preds = %do.end, %while.body33.preheader
- %indvar = phi i64 [ %indvar.next, %do.end ], [ 0, %while.body33.preheader ]
- %sum2.2385 = phi i64 [ %rem102, %do.end ], [ %and, %while.body33.preheader ]
- %len.addr.1384 = phi i64 [ %sub34, %do.end ], [ %len, %while.body33.preheader ]
- %buf.addr.1383 = phi i8* [ %scevgep390, %do.end ], [ %buf, %while.body33.preheader ]
- %adler.addr.3382 = phi i64 [ %rem101, %do.end ], [ %and1, %while.body33.preheader ]
- %0 = mul i64 %indvar, 5552
- %1 = add i64 %0, -13
- %scevgep2 = getelementptr i8, i8* %buf, i64 %1
- %sub34 = add i64 %len.addr.1384, -5552
- call void @llvm.ppc.mtctr.i64(i64 347)
- br label %do.body
-
- do.body: ; preds = %do.body, %while.body33
- %adler.addr.4 = phi i64 [ %adler.addr.3382, %while.body33 ], [ %add49, %do.body ]
- %sum2.3 = phi i64 [ %sum2.2385, %while.body33 ], [ %add98, %do.body ]
- %tmp15.phi = phi i8* [ %scevgep2, %while.body33 ], [ %tmp15.inc, %do.body ]
- %tmp15.inc = getelementptr i8, i8* %tmp15.phi, i64 16
- %add38 = add i64 %adler.addr.4, %sum2.3
- %add42 = add i64 %add38, %adler.addr.4
- %add46 = add i64 %add42, %adler.addr.4
- %tmp15 = load i8, i8* %tmp15.inc, align 1, !tbaa !1
- %conv48 = zext i8 %tmp15 to i64
- %add49 = add i64 %adler.addr.4, %conv48
- %add50 = add i64 %add46, %add49
- %add54 = add i64 %add50, %add49
- %add58 = add i64 %add54, %add49
- %add62 = add i64 %add58, %add49
- %add66 = add i64 %add62, %add49
- %add70 = add i64 %add66, %add49
- %add74 = add i64 %add70, %add49
- %add78 = add i64 %add74, %add49
- %add82 = add i64 %add78, %add49
- %add86 = add i64 %add82, %add49
- %add90 = add i64 %add86, %add49
- %add94 = add i64 %add90, %add49
- %add98 = add i64 %add94, %add49
- %2 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
- br i1 %2, label %do.body, label %do.end
-
- do.end: ; preds = %do.body
- %scevgep390 = getelementptr i8, i8* %buf.addr.1383, i64 5552
- %rem101 = urem i64 %add49, 65521
- %rem102 = urem i64 %add98, 65521
- %cmp31 = icmp ugt i64 %sub34, 5551
- %indvar.next = add i64 %indvar, 1
- br i1 %cmp31, label %while.body33, label %while.end103
-
- while.end103: ; preds = %do.end
- br i1 undef, label %if.end188, label %while.body109.preheader
-
- while.body109.preheader: ; preds = %while.end103, %while.cond30.preheader
- %buf.addr.1.lcssa394400 = phi i8* [ %buf, %while.cond30.preheader ], [ %scevgep390, %while.end103 ]
- %arrayidx151 = getelementptr inbounds i8, i8* %buf.addr.1.lcssa394400, i64 10
- %tmp45 = load i8, i8* %arrayidx151, align 1, !tbaa !1
- %conv152 = zext i8 %tmp45 to i64
- br label %while.body109
-
- while.body109: ; preds = %while.body109, %while.body109.preheader
- %adler.addr.5373 = phi i64 [ %add153, %while.body109 ], [ undef, %while.body109.preheader ]
- %add153 = add i64 %adler.addr.5373, %conv152
- br label %while.body109
-
- if.end188: ; preds = %while.end103
- %shl189 = shl nuw nsw i64 %rem102, 16
- %or190 = or i64 %shl189, %rem101
- br label %cleanup
-
- cleanup: ; preds = %if.end188, %while.cond.preheader, %if.then
- %retval.0 = phi i64 [ %or, %if.then ], [ %or28, %while.cond.preheader ], [ %or190, %if.end188 ]
- ret i64 %retval.0
- }
-
- ; Function Attrs: nounwind
- declare void @llvm.ppc.mtctr.i64(i64) #1
-
- ; Function Attrs: nounwind
- declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #1
-
- ; Function Attrs: nounwind
- declare void @llvm.stackprotector(i8*, i8**) #1
-
- attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { nounwind }
-
- !llvm.ident = !{!0}
-
- !0 = !{!"clang version 5.0.0 "}
- !1 = !{!2, !2, i64 0}
- !2 = !{!"omnipotent char", !3, i64 0}
- !3 = !{!"Simple C/C++ TBAA"}
-
-...
----
-name: adler32_z
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%x3' }
- - { reg: '%x4' }
- - { reg: '%x5' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-fixedStack:
- - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' }
- - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' }
- - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false }
-body: |
- bb.0.entry:
- successors: %bb.1.if.then(0x40000000), %bb.3.if.end15(0x40000000)
- liveins: %x3, %x4, %x5, %x29, %x30
-
- %x6 = RLWINM8 %x3, 16, 16, 31
- %x3 = RLDICL killed %x3, 0, 48
- BC undef %cr5lt, %bb.3.if.end15
-
- bb.1.if.then:
- successors: %bb.2.if.then(0x80000000)
- liveins: %x3, %x6, %x29, %x30
-
- %x4 = ADD8 %x3, killed %x6
-
- bb.2.if.then:
- liveins: %lr8, %rm, %x3, %x4
-
- %x4 = RLDICR killed %x4, 16, 47
- %x3 = OR8 killed %x4, killed %x3
- BLR8 implicit %lr8, implicit %rm, implicit %x3
-
- bb.3.if.end15:
- successors: %bb.6.while.cond.preheader(0x40000000), %bb.4.while.cond30.preheader(0x40000000)
- liveins: %x3, %x4, %x5, %x6, %x29, %x30
-
- BC undef %cr5lt, %bb.6.while.cond.preheader
-
- bb.4.while.cond30.preheader:
- successors: %bb.7.while.body33.preheader(0x40000000), %bb.5(0x40000000)
- liveins: %x3, %x4, %x5, %x6, %x29, %x30
-
- BCn undef %cr5lt, %bb.7.while.body33.preheader
-
- bb.5:
- successors: %bb.12.while.body109.preheader(0x80000000)
- liveins: %x4, %x29, %x30
-
- %x7 = OR8 %x4, killed %x4
- B %bb.12.while.body109.preheader
-
- bb.6.while.cond.preheader:
- successors: %bb.2.if.then(0x80000000)
- liveins: %x3, %x6, %x29, %x30
-
- %x4 = LIS8 15
- %x4 = ORI8 killed %x4, 225
- %x4 = RLDICR killed %x4, 32, 31
- %x4 = ORIS8 killed %x4, 3375
- %x4 = ORI8 killed %x4, 50637
- %x4 = MULHDU %x6, killed %x4
- %x5 = SUBF8 %x4, %x6
- %x5 = RLDICL killed %x5, 63, 1
- %x4 = ADD8 killed %x5, killed %x4
- %x5 = LI8 0
- %x4 = RLDICL killed %x4, 49, 15
- %x5 = ORI8 killed %x5, 65521
- %x4 = MULLD killed %x4, killed %x5
- %x4 = SUBF8 killed %x4, killed %x6
- B %bb.2.if.then
-
- bb.7.while.body33.preheader:
- successors: %bb.8.while.body33(0x80000000)
- liveins: %x3, %x4, %x5, %x6, %x29, %x30
-
- STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1)
- STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16)
- %x7 = LIS8 15
- %x7 = ORI8 killed %x7, 225
- %x7 = RLDICR killed %x7, 32, 31
- %x8 = LI8 0
- %x7 = ORIS8 killed %x7, 3375
- %x9 = LI8 347
- %x10 = ORI8 killed %x7, 50637
- %x11 = ORI8 %x8, 65521
- %x7 = OR8 %x4, %x4
-
- bb.8.while.body33:
- successors: %bb.9.do.body(0x80000000)
- liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11
-
- %x12 = MULLI8 %x8, 5552
- %x12 = ADD8 %x4, killed %x12
- %x12 = ADDI8 killed %x12, -13
- %x5 = ADDI8 killed %x5, -5552
- MTCTR8loop %x9, implicit-def dead %ctr8
-
- bb.9.do.body:
- successors: %bb.9.do.body(0x7c000000), %bb.10.do.end(0x04000000)
- liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12
-
- %x0, %x12 = LBZU8 16, killed %x12 :: (load 1 from %ir.tmp15.inc, !tbaa !1)
- %x6 = ADD8 %x3, killed %x6
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x3 = ADD8 killed %x3, killed %x0
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- %x6 = ADD8 killed %x6, %x3
- BDNZ8 %bb.9.do.body, implicit-def %ctr8, implicit %ctr8
-
- bb.10.do.end:
- successors: %bb.8.while.body33(0x7c000000), %bb.11.while.end103(0x04000000)
- liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11
-
- %x12 = MULHDU %x3, %x10
- %x0 = MULHDU %x6, %x10
- %x30 = SUBF8 %x12, %x3
- %x29 = SUBF8 %x0, %x6
- %x30 = RLDICL killed %x30, 63, 1
- %x29 = RLDICL killed %x29, 63, 1
- %x12 = ADD8 killed %x30, killed %x12
- %x0 = ADD8 killed %x29, killed %x0
- %cr0 = CMPLDI %x5, 5551
- %x12 = RLDICL killed %x12, 49, 15
- %x0 = RLDICL killed %x0, 49, 15
- %x12 = MULLD killed %x12, %x11
- %x0 = MULLD killed %x0, %x11
- %x7 = ADDI8 killed %x7, 5552
- %x3 = SUBF8 killed %x12, killed %x3
- %x6 = SUBF8 killed %x0, killed %x6
- %x8 = ADDI8 killed %x8, 1
- BCC 44, killed %cr0, %bb.8.while.body33
-
- bb.11.while.end103:
- successors: %bb.14.if.end188(0x40000000), %bb.12.while.body109.preheader(0x40000000)
- liveins: %x3, %x6, %x7
-
- %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16)
- %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1)
- BC undef %cr5lt, %bb.14.if.end188
-
- bb.12.while.body109.preheader:
- successors: %bb.13.while.body109(0x80000000)
- liveins: %x7, %x29, %x30
-
- %x3 = LBZ8 10, killed %x7 :: (load 1 from %ir.arrayidx151, !tbaa !1)
- %x4 = IMPLICIT_DEF
-
- bb.13.while.body109:
- successors: %bb.13.while.body109(0x80000000)
- liveins: %x3, %x4, %x29, %x30
-
- %x4 = ADD8 killed %x4, %x3
- B %bb.13.while.body109
-
- bb.14.if.end188:
- liveins: %x3, %x6, %x29, %x30
-
- %x4 = RLDICR killed %x6, 16, 47
- %x3 = OR8 killed %x4, killed %x3
- BLR8 implicit %lr8, implicit %rm, implicit %x3
-
-...
diff --git a/test/CodeGen/PowerPC/testComparesieqsll.ll b/test/CodeGen/PowerPC/testComparesieqsll.ll
new file mode 100644
index 0000000000000..57c7365eff03a
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesieqsll.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesieqsll.c'
+
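+; These equality tests should need no compare instructions: xor yields zero
+; iff the operands are equal, cntlzd of that zero is 64, and
+; rldicl r, r, 58, 63 rotates bit 57 down to bit 63 to produce a 0/1 result.
+; In the sign-extended variants, addic r, r, -1 sets the carry iff r was
+; non-zero, and subfe r, r, r then materializes 0 or -1 from that carry.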
+@glob = common local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsll(i64 %a, i64 %b) {
+; CHECK-LABEL: test_ieqsll:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsll_sext(i64 %a, i64 %b) {
+; CHECK-LABEL: test_ieqsll_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsll_z(i64 %a) {
+; CHECK-LABEL: test_ieqsll_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsll_sext_z(i64 %a) {
+; CHECK-LABEL: test_ieqsll_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsll_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_ieqsll_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsll_sext_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_ieqsll_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsll_z_store(i64 %a) {
+; CHECK-LABEL: test_ieqsll_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsll_sext_z_store(i64 %a) {
+; CHECK-LABEL: test_ieqsll_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesiequll.ll b/test/CodeGen/PowerPC/testComparesiequll.ll
new file mode 100644
index 0000000000000..c289290718455
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesiequll.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesiequll.c'
+
+@glob = common local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequll(i64 %a, i64 %b) {
+; CHECK-LABEL: test_iequll:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequll_sext(i64 %a, i64 %b) {
+; CHECK-LABEL: test_iequll_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequll_z(i64 %a) {
+; CHECK-LABEL: test_iequll_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequll_sext_z(i64 %a) {
+; CHECK-LABEL: test_iequll_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequll_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_iequll_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequll_sext_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_iequll_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequll_z_store(i64 %a) {
+; CHECK-LABEL: test_iequll_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequll_sext_z_store(i64 %a) {
+; CHECK-LABEL: test_iequll_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testCompareslleqsll.ll b/test/CodeGen/PowerPC/testCompareslleqsll.ll
new file mode 100644
index 0000000000000..4797ddfbfe970
--- /dev/null
+++ b/test/CodeGen/PowerPC/testCompareslleqsll.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsll(i64 %a, i64 %b) {
+; CHECK-LABEL: test_lleqsll:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsll_sext(i64 %a, i64 %b) {
+; CHECK-LABEL: test_lleqsll_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsll_z(i64 %a) {
+; CHECK-LABEL: test_lleqsll_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsll_sext_z(i64 %a) {
+; CHECK-LABEL: test_lleqsll_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsll_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_lleqsll_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsll_sext_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_lleqsll_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsll_z_store(i64 %a) {
+; CHECK-LABEL: test_lleqsll_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsll_sext_z_store(i64 %a) {
+; CHECK-LABEL: test_lleqsll_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesllequll.ll b/test/CodeGen/PowerPC/testComparesllequll.ll
new file mode 100644
index 0000000000000..4dc7be69d2c8d
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesllequll.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequll(i64 %a, i64 %b) {
+; CHECK-LABEL: test_llequll:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequll_sext(i64 %a, i64 %b) {
+; CHECK-LABEL: test_llequll_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequll_z(i64 %a) {
+; CHECK-LABEL: test_llequll_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequll_sext_z(i64 %a) {
+; CHECK-LABEL: test_llequll_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequll_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_llequll_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequll_sext_store(i64 %a, i64 %b) {
+; CHECK-LABEL: test_llequll_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequll_z_store(i64 %a) {
+; CHECK-LABEL: test_llequll_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzd r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicl r3, r3, 58, 63
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequll_sext_z_store(i64 %a) {
+; CHECK-LABEL: test_llequll_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: addic r3, r3, -1
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: subfe r3, r3, r3
+; CHECK-NEXT: std r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i64 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ store i64 %conv1, i64* @glob, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_xxpermdi.ll b/test/CodeGen/PowerPC/vec_xxpermdi.ll
new file mode 100644
index 0000000000000..9be2a1864a04e
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_xxpermdi.ll
@@ -0,0 +1,307 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-BE
+
+; Possible LE ShuffleVector masks (Case 1):
+; ShuffleVector((vector double)a, (vector double)b, 3, 1)
+; ShuffleVector((vector double)a, (vector double)b, 2, 1)
+; ShuffleVector((vector double)a, (vector double)b, 3, 0)
+; ShuffleVector((vector double)a, (vector double)b, 2, 0)
+; which lower to:
+; xxpermdi a, b, 0
+; xxpermdi a, b, 1
+; xxpermdi a, b, 2
+; xxpermdi a, b, 3
+; Possible LE Swap ShuffleVector masks (Case 2):
+; ShuffleVector((vector double)a, (vector double)b, 1, 3)
+; ShuffleVector((vector double)a, (vector double)b, 0, 3)
+; ShuffleVector((vector double)a, (vector double)b, 1, 2)
+; ShuffleVector((vector double)a, (vector double)b, 0, 2)
+; which lower to:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible LE ShuffleVector masks when a == b, i.e. b is undef (Case 3):
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; which lower to:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
+
+; Possible BE ShuffleVector masks (Case 4):
+; ShuffleVector((vector double)a, (vector double)b, 0, 2)
+; ShuffleVector((vector double)a, (vector double)b, 0, 3)
+; ShuffleVector((vector double)a, (vector double)b, 1, 2)
+; ShuffleVector((vector double)a, (vector double)b, 1, 3)
+; which lower to:
+; xxpermdi a, b, 0
+; xxpermdi a, b, 1
+; xxpermdi a, b, 2
+; xxpermdi a, b, 3
+; Possible BE Swap ShuffleVector masks (Case 5):
+; ShuffleVector((vector double)a, (vector double)b, 2, 0)
+; ShuffleVector((vector double)a, (vector double)b, 3, 0)
+; ShuffleVector((vector double)a, (vector double)b, 2, 1)
+; ShuffleVector((vector double)a, (vector double)b, 3, 1)
+; which lower to:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible BE ShuffleVector masks when a == b, i.e. b is undef (Case 6):
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; which lower to:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
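+; As a worked example of the tables above (assuming the usual xxpermdi
+; encoding, where immediate bit 0 picks the doubleword taken from the first
+; source and bit 1 the doubleword taken from the second): on BE, mask <0, 3>
+; takes element 0 of a and element 1 of b, i.e. xxpermdi a, b, 1; on LE the
+; element numbering within each register is reversed, so mask <2, 1> lowers
+; to the same immediate.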
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 3, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 3, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 34, 35, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 35, 34, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_0
+; CHECK-LE: xxspltd 34, 34, 0
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_2
+; CHECK-LE: xxswapd 34, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_3
+; CHECK-LE: xxspltd 34, 34, 1
+; CHECK-LE: blr
+}
+
+; Start testing BE
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 34, 35, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 35, 34, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 3, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 35, 34, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+  %0 = shufflevector <2 x double> %VA, <2 x double> %VB, <2 x i32> <i32 3, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_0
+; CHECK-BE: xxspltd 34, 34, 0
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_2
+; CHECK-BE: xxswapd 34, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_3
+; CHECK-BE: xxspltd 34, 34, 1
+; CHECK-BE: blr
+}
+
+; More tests covering other vector element types
+define <16 x i8> @test_be_vec_xxpermdi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) {
+ entry:
+  %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x i8> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v16i8_v16i8
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <8 x i16> @test_le_swap_vec_xxpermdi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) {
+ entry:
+  %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i16> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v8i16_v8i16
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @test_le_swap_vec_xxpermdi_v4i32_v4i32(<4 x i32> %VA, <4 x i32> %VB) {
+ entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v4i32_v4i32
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
diff --git a/test/CodeGen/Thumb2/tbb-removeadd.mir b/test/CodeGen/Thumb2/tbb-removeadd.mir
index 89ed987205394..1060667913439 100644
--- a/test/CodeGen/Thumb2/tbb-removeadd.mir
+++ b/test/CodeGen/Thumb2/tbb-removeadd.mir
@@ -39,7 +39,6 @@
name: Func
alignment: 1
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 4ec703921e29f..24aa5b98d0bb8 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
; CHECK: mulss
; CHECK: mulss
-; CHECK: mulss
-; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
; CHECK: ret
}
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
index bc394f6e156fb..6c60aed67a7ba 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
@@ -5,7 +5,6 @@ define void @test_void_return() {
; CHECK-LABEL: name: test_void_return
; CHECK: alignment: 4
; CHECK-NEXT: exposesReturnsTwice: false
-; CHECK-NEXT: noVRegs: false
; CHECK-NEXT: legalized: false
; CHECK-NEXT: regBankSelected: false
; CHECK-NEXT: selected: false
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index b9f7fc68cf689..ad82b8cfb775e 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -9,9 +9,11 @@
define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: adcl $0, %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: adcl %ecx, %eax
; CHECK-NEXT: retl
%add4 = add i32 %x, %sum
%cmp = icmp ult i32 %add4, %x
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index 3f4ee362e230f..3c84af4aa9ec6 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -86,21 +86,14 @@ entry:
define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
; CHECK-LABEL: pr31719:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xorl %r10d, %r10d
-; CHECK-NEXT: addq 8(%rsi), %rcx
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: addq 16(%rsi), %r8
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 24(%rsi), %r9
; CHECK-NEXT: addq (%rsi), %rdx
-; CHECK-NEXT: adcq $0, %rcx
-; CHECK-NEXT: adcq %r8, %r10
-; CHECK-NEXT: adcq %r9, %rax
+; CHECK-NEXT: adcq 8(%rsi), %rcx
+; CHECK-NEXT: adcq 16(%rsi), %r8
+; CHECK-NEXT: adcq 24(%rsi), %r9
; CHECK-NEXT: movq %rdx, (%rdi)
; CHECK-NEXT: movq %rcx, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %rax, 24(%rdi)
+; CHECK-NEXT: movq %r8, 16(%rdi)
+; CHECK-NEXT: movq %r9, 24(%rdi)
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
entry:
@@ -190,9 +183,9 @@ entry:
define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-LABEL: shiftadd:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: leaq (%rdx,%rcx), %rax
; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: adcq $0, %rax
+; CHECK-NEXT: adcq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: retq
entry:
%0 = zext i64 %a to i128
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 2aaf14001758f..aa28ef5175ed6 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm6, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm10, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: paddd %xmm9, %xmm13
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: paddd %xmm14, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm3
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm13
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm9
; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm4, %xmm13
+; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; SSE2-LABEL: avg_v64i8:
; SSE2: # BB#0:
-; SSE2-NEXT: subq $152, %rsp
-; SSE2-NEXT: .Lcfi0:
-; SSE2-NEXT: .cfi_def_cfa_offset 160
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa 32(%rdi), %xmm5
-; SSE2-NEXT: movdqa 48(%rdi), %xmm6
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-NEXT: movdqa 48(%rdi), %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsi), %xmm5
+; SSE2-NEXT: movdqa 16(%rsi), %xmm13
+; SSE2-NEXT: movdqa 32(%rsi), %xmm11
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm2, %xmm15
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm15, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm10, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsi), %xmm14
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm14, %xmm12
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE2-NEXT: movdqa 16(%rsi), %xmm12
-; SSE2-NEXT: movdqa %xmm12, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm15, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm8, %xmm15
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm13
+; SSE2-NEXT: movdqa %xmm11, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm7, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm2, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa 48(%rsi), %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa 48(%rsi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
-; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: paddd %xmm0, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm15
+; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: packuswb %xmm1, %xmm10
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: packuswb %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm12
+; SSE2-NEXT: pand %xmm0, %xmm12
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: packuswb %xmm12, %xmm4
+; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: psrld $1, %xmm15
; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm15, %xmm7
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: pand %xmm0, %xmm13
+; SSE2-NEXT: packuswb %xmm15, %xmm13
+; SSE2-NEXT: packuswb %xmm4, %xmm13
+; SSE2-NEXT: psrld $1, %xmm6
; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: packuswb %xmm9, %xmm14
-; SSE2-NEXT: packuswb %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm13
; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm13, %xmm6
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm10, %xmm12
-; SSE2-NEXT: packuswb %xmm6, %xmm12
-; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: packuswb %xmm9, %xmm6
; SSE2-NEXT: psrld $1, %xmm11
+; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: pand %xmm0, %xmm14
; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: packuswb %xmm11, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm5
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: packuswb %xmm14, %xmm11
+; SSE2-NEXT: packuswb %xmm6, %xmm11
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm5, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm7
+; SSE2-NEXT: packuswb %xmm3, %xmm7
+; SSE2-NEXT: movdqu %xmm7, (%rax)
+; SSE2-NEXT: movdqu %xmm11, (%rax)
+; SSE2-NEXT: movdqu %xmm13, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm12, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
-; SSE2-NEXT: addq $152, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
@@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10
@@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
@@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -884,9 +867,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
@@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm6, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm10, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: paddd %xmm9, %xmm13
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: paddd %xmm14, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm3
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm13
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm9
; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm4, %xmm13
+; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
@@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 341dd867e4ff4..647b7a8f4dfca 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK: insertps $48
+; CHECK: vaddps
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: vaddps
; CHECK: vaddps
-; CHECK: vaddps
; CHECK-NEXT: ret
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index 63b0281a73399..e29cf09718ad9 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3
; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: korw %k3, %k2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
+; CHECK-NEXT: korw %k2, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 4890afec2164b..c03623a2f0359 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
-; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT: movw $220, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 32da0a70218e3..431223611faea 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float>
; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32>
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
@@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
@@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
@@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
@@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3)
@@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
@@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
@@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 563cad04b8c2d..b04c1ab38e559 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -479,11 +479,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
@@ -498,11 +498,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2si %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
@@ -517,11 +517,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2usi %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
@@ -536,11 +536,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %rcx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
@@ -555,11 +555,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -574,11 +574,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -593,11 +593,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -612,11 +612,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -685,8 +685,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -4398,8 +4399,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4418,8 +4419,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4520,9 +4521,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4543,9 +4544,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovapd %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4612,9 +4613,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c32..96aefdb105845 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_64i1:
; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
%cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc28..faa055dfbbf3f 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 51f9a382ccbfd..ca01033bf78ba 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 7df07b0413ed4..571f345d4616b 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd5..f8f47c87100ad 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107d..96254f7c95b0f 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index cf79819734a2d..636358fb91cbd 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -39,8 +39,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 06ee237593e79..d54208c00987c 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -404,8 +404,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -424,8 +424,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf519..595b3e0ebb863 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c20311..1bfdfd0e634de 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e5..9659dc6d455af 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 3ca686cef3bf4..b2fe6eba88aba 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 4d906a4fd29a2..c2d8df6476b3e 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
@@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 1f324d6795649..684b0468cf518 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index a681c3b0aa429..092b139fca2f9 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -6,68 +6,35 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
-; SSE2-SSSE3-LABEL: v8i16:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
; AVX12: ## BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrw $7, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $6, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $5, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $4, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
@@ -90,22 +57,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
@@ -113,19 +66,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
@@ -149,22 +91,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movd %xmm3, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
@@ -172,19 +100,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
@@ -208,56 +125,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
@@ -265,55 +134,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrb $15, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $14, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $13, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $12, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $11, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $10, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $9, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $8, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $7, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $6, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $5, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $4, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $3, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $2, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $1, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
@@ -383,14 +205,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
@@ -405,26 +221,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
@@ -439,26 +250,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
@@ -537,14 +343,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
@@ -559,26 +359,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
@@ -593,26 +388,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
@@ -683,14 +473,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
@@ -703,24 +487,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
@@ -733,24 +512,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
@@ -801,14 +575,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
@@ -816,13 +584,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrq $1, %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovq %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
@@ -846,14 +609,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movq %xmm3, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
@@ -861,13 +618,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrq $1, %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovq %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
@@ -892,29 +644,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-NEXT: psrad $24, %xmm3
; SSE2-SSSE3-NEXT: pslld $24, %xmm2
; SSE2-SSSE3-NEXT: psrad $24, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pslld $24, %xmm1
; SSE2-SSSE3-NEXT: psrad $24, %xmm1
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
@@ -923,26 +661,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
@@ -975,29 +702,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-NEXT: psrad $16, %xmm3
; SSE2-SSSE3-NEXT: pslld $16, %xmm2
; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pslld $16, %xmm1
; SSE2-SSSE3-NEXT: psrad $16, %xmm1
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
@@ -1006,26 +719,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
@@ -1052,45 +754,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
}
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
-; SSE2-SSSE3-LABEL: v8i8:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: psllw $8, %xmm3
-; SSE2-SSSE3-NEXT: psraw $8, %xmm3
-; SSE2-SSSE3-NEXT: psllw $8, %xmm2
-; SSE2-SSSE3-NEXT: psraw $8, %xmm2
-; SSE2-SSSE3-NEXT: psllw $8, %xmm1
-; SSE2-SSSE3-NEXT: psraw $8, %xmm1
-; SSE2-SSSE3-NEXT: psllw $8, %xmm0
-; SSE2-SSSE3-NEXT: psraw $8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i8:
+; SSE2: ## BB#0:
+; SSE2-NEXT: psllw $8, %xmm3
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: psllw $8, %xmm2
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i8:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: psllw $8, %xmm3
+; SSSE3-NEXT: psraw $8, %xmm3
+; SSSE3-NEXT: psllw $8, %xmm2
+; SSSE3-NEXT: psraw $8, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: psraw $8, %xmm1
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
; AVX12: ## BB#0:
@@ -1098,38 +797,16 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrw $7, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $6, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $5, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $4, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
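
Every hunk in this file exercises the same IR shape: two lane-wise signed compares, an AND of the resulting <N x i1> masks, and a bitcast of that mask to an N-bit integer. The updated checks show the backend now lowering the final step with a single MOVMSK-family instruction (movmskpd/movmskps/pmovmskb, after a pshufb or pand+packuswb narrowing where the element count requires it) instead of extracting each lane and spilling it through the stack. A minimal sketch of that pattern — the function name is illustrative, since the test bodies fall outside the hunks shown:

define i4 @v4i8_sketch(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
  ; lane-wise signed compares yield <4 x i1> masks
  %m0 = icmp sgt <4 x i8> %a, %b
  %m1 = icmp sgt <4 x i8> %c, %d
  ; AND the masks, then bitcast <4 x i1> to i4 -- the step the new
  ; checks lower to a movmskps of the sign-extended lanes
  %m = and <4 x i1> %m0, %m1
  %r = bitcast <4 x i1> %m to i4
  ret i4 %r
}
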
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 06b1a76f6baed..a6d6ca155302e 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -1,8 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
+; SSE2-SSSE3-LABEL: v4i64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm10, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4i64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
@@ -12,19 +87,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskps %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -45,30 +109,36 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
}
define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
-; AVX2-LABEL: v4f64:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; SSE2-SSSE3-LABEL: v4f64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: cmpltpd %xmm5, %xmm7
+; SSE2-SSSE3-NEXT: cmpltpd %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm6
+; SSE2-SSSE3-NEXT: psrad $31, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
+; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v4f64:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f64:
; AVX512: ## BB#0:
@@ -87,6 +157,78 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
}
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
+; SSE2-LABEL: v16i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v16i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: psllw $7, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
+; SSSE3-NEXT: pshufb %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psllw $7, %xmm4
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v16i16:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
@@ -96,55 +238,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -164,6 +259,79 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
}
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
+; SSE2-LABEL: v8i32:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i32:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: psllw $15, %xmm0
+; SSSE3-NEXT: psraw $15, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pshufb %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psllw $15, %xmm4
+; SSSE3-NEXT: psraw $15, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm4, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v8i32:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
@@ -173,31 +341,9 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -217,42 +363,74 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
}
define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
-; AVX2-LABEL: v8f32:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vcmpltps %ymm2, %ymm3, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; SSE2-LABEL: v8f32:
+; SSE2: ## BB#0:
+; SSE2-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: cmpltps %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm4, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8f32:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: cmpltps %xmm1, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: cmpltps %xmm0, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: psllw $15, %xmm2
+; SSSE3-NEXT: psraw $15, %xmm2
+; SSSE3-NEXT: cmpltps %xmm5, %xmm7
+; SSSE3-NEXT: pshufb %xmm1, %xmm7
+; SSSE3-NEXT: cmpltps %xmm4, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSSE3-NEXT: psllw $15, %xmm6
+; SSSE3-NEXT: psraw $15, %xmm6
+; SSSE3-NEXT: pand %xmm2, %xmm6
+; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm6, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v8f32:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v8f32:
; AVX512: ## BB#0:
@@ -270,121 +448,250 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
}
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
+; SSE2-SSSE3-LABEL: v32i8:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: shll $16, %ecx
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: orl %ecx, %eax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v32i8:
; AVX2: ## BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: Lcfi1:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: movl (%rsp), %eax
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
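
The 256-bit file exercises the same compare/and/bitcast pattern on ymm-sized vectors; here the new checks first pack the two 128-bit halves back together (vextractf128 + vpacksswb on AVX1, or a single ymm vpmovmskb on AVX2 for v32i8) before reading the sign bits. A hedged sketch of the v32i8 shape, assuming the usual autogenerated body:

define i32 @v32i8_sketch(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
  ; two 256-bit lane-wise compares, ANDed into one <32 x i1> mask
  %m0 = icmp sgt <32 x i8> %a, %b
  %m1 = icmp sgt <32 x i8> %c, %d
  %m = and <32 x i1> %m0, %m1
  ; bitcast <32 x i1> to i32 -- on AVX2 this becomes one vpmovmskb %ymm
  %r = bitcast <32 x i1> %m to i32
  ret i32 %r
}
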
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index d1508f99fc71e..9bf7b41a4f26a 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -1,69 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-SSSE3-LABEL: v8i16:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: v8i16:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; SSSE3-LABEL: v8i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v8i16:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
; AVX512: ## BB#0:
@@ -80,41 +46,16 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-SSSE3-LABEL: v4i32:
; SSE2-SSSE3: ## BB#0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v4i32:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v4i32:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
; AVX512: ## BB#0:
@@ -132,42 +73,16 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
; SSE2-SSSE3-LABEL: v4f32:
; SSE2-SSSE3: ## BB#0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v4f32:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vextractps $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vextractps $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vextractps $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vextractps $0, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v4f32:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
; AVX512: ## BB#0:
@@ -185,111 +100,16 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-SSSE3-LABEL: v16i8:
; SSE2-SSSE3: ## BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v16i8:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: retq
+; AVX12-LABEL: v16i8:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
; AVX512: ## BB#0:
@@ -330,14 +150,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movq %xmm1, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
@@ -353,15 +167,27 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
+; AVX2-LABEL: v2i8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: v2i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
@@ -406,14 +232,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movq %xmm1, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
@@ -429,15 +249,27 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
+; AVX2-LABEL: v2i16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: v2i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
@@ -478,14 +310,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movq %xmm1, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
@@ -499,15 +325,25 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
+; AVX2-LABEL: v2i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: v2i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
@@ -538,27 +374,16 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movq %xmm1, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2i64:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v2i64:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
; AVX512: ## BB#0:
@@ -576,27 +401,16 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
; SSE2-SSSE3-LABEL: v2f64:
; SSE2-SSSE3: ## BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movq %xmm1, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2f64:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v2f64:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
; AVX512: ## BB#0:
@@ -618,45 +432,20 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v4i8:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v4i8:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
; AVX512: ## BB#0:
@@ -682,45 +471,20 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm1, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v4i16:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; AVX12-LABEL: v4i16:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
; AVX512: ## BB#0:
@@ -739,73 +503,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
}
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
-; SSE2-SSSE3-LABEL: v8i8:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: psllw $8, %xmm1
-; SSE2-SSSE3-NEXT: psraw $8, %xmm1
-; SSE2-SSSE3-NEXT: psllw $8, %xmm0
-; SSE2-SSSE3-NEXT: psraw $8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i8:
+; SSE2: ## BB#0:
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: v8i8:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX1-NEXT: retq
+; SSSE3-LABEL: v8i8:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: psraw $8, %xmm1
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v8i8:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
; AVX512: ## BB#0:
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index 51c6ad7c7f9ef..b2c619c48d4d3 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -8,55 +8,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -76,33 +29,8 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX2-LABEL: v8i32:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -122,33 +50,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
; AVX2-LABEL: v8f32:
; AVX2: ## BB#0:
; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -167,117 +70,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX2-LABEL: v32i8:
; AVX2: ## BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: Lcfi1:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm1, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, (%rsp)
-; AVX2-NEXT: movl (%rsp), %eax
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -296,21 +90,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-LABEL: v4i64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -331,21 +112,8 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
; AVX2-LABEL: v4f64:
; AVX2: ## BB#0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll
index a9c74df9d0d91..1340b7662a7ad 100644
--- a/test/CodeGen/X86/bswap_tree2.ll
+++ b/test/CodeGen/X86/bswap_tree2.ll
@@ -9,31 +9,32 @@
define i32 @test1(i32 %x) nounwind {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK-NEXT: shll $8, %edx
-; CHECK-NEXT: shrl $8, %eax
-; CHECK-NEXT: bswapl %ecx
-; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: shrl $8, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: shrl $16, %eax
; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test1:
; CHECK64: # BB#0:
-; CHECK64-NEXT: movl %edi, %ecx
-; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK64-NEXT: shll $8, %ecx
-; CHECK64-NEXT: shrl $8, %eax
+; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
+; CHECK64-NEXT: movl %edi, %ecx
+; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; CHECK64-NEXT: shll $8, %eax
+; CHECK64-NEXT: shrl $8, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
; CHECK64-NEXT: bswapl %edi
; CHECK64-NEXT: shrl $16, %edi
-; CHECK64-NEXT: orl %ecx, %eax
-; CHECK64-NEXT: orl %edi, %eax
+; CHECK64-NEXT: orl %ecx, %edi
+; CHECK64-NEXT: movl %edi, %eax
; CHECK64-NEXT: retq
%byte0 = and i32 %x, 255 ; 0x000000ff
%byte1 = and i32 %x, 65280 ; 0x0000ff00
diff --git a/test/CodeGen/X86/eh-unknown.ll b/test/CodeGen/X86/eh-unknown.ll
new file mode 100644
index 0000000000000..7c495bdadc676
--- /dev/null
+++ b/test/CodeGen/X86/eh-unknown.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+; An unknown personality forces us to emit an Itanium LSDA. Make sure that the
+; Itanium call site table actually tells the personality to keep unwinding,
+; i.e., we have an entry and it says "has no landing pad".
+
+declare void @throwit()
+declare void @__unknown_ehpersonality(...)
+
+define void @use_unknown_ehpersonality()
+ personality void (...)* @__unknown_ehpersonality {
+entry:
+ call void @throwit()
+ unreachable
+}
+
+; CHECK-LABEL: use_unknown_ehpersonality:
+; CHECK: .Lfunc_begin0:
+; CHECK: .seh_handler __unknown_ehpersonality, @unwind, @except
+; CHECK: callq throwit
+; CHECK: .Lfunc_end0:
+; CHECK: .seh_handlerdata
+; CHECK: .Lexception0:
+; CHECK: .byte 255 # @LPStart Encoding = omit
+; CHECK: .byte 0 # @TType Encoding = absptr
+; CHECK: .asciz "\217\200" # @TType base offset
+; CHECK: .byte 3 # Call site Encoding = udata4
+; CHECK: .byte 13 # Call site table length
+; CHECK: .long .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 <<
+; CHECK: .long .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0
+; CHECK: .long 0 # has no landing pad
+; CHECK: .byte 0 # On action: cleanup
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index bd8888966cf2c..338a95f6a80cd 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -117,9 +117,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
@@ -137,9 +137,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
-; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; FMA4-NEXT: retq
entry:
%AB = fmul <8 x double> %A, %B
@@ -157,9 +157,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
@@ -178,9 +178,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; FMA4-NEXT: retq
entry:
%AB = fmul <16 x float> %A, %B
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index d68236e9d250e..eb06eb75a4d70 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
-; CHECK: addl {{.*#+}} 4-byte Folded Reload
-; CHECK: imull {{.*#+}} 4-byte Folded Reload
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
; CHECK: retl
%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 98082ec611d49..6c6bc8bdc1d13 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/gnu-seh-nolpads.ll b/test/CodeGen/X86/gnu-seh-nolpads.ll
new file mode 100644
index 0000000000000..311f4d522b1df
--- /dev/null
+++ b/test/CodeGen/X86/gnu-seh-nolpads.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-windows-gnu < %s | FileCheck %s
+
+declare void @throwit()
+declare void @__gxx_personality_seh0(...)
+declare void @__gcc_personality_seh0(...)
+
+define void @use_gxx_seh()
+ personality void (...)* @__gxx_personality_seh0 {
+entry:
+ call void @throwit()
+ unreachable
+}
+
+; CHECK-LABEL: use_gxx_seh:
+; CHECK: .seh_proc use_gxx_seh
+; CHECK-NOT: .seh_handler __gxx_personality_seh0
+; CHECK: callq throwit
+; CHECK: .seh_handlerdata
+; CHECK: .seh_endproc
+
+define void @use_gcc_seh()
+ personality void (...)* @__gcc_personality_seh0 {
+entry:
+ call void @throwit()
+ unreachable
+}
+
+; CHECK-LABEL: use_gcc_seh:
+; CHECK: .seh_proc use_gcc_seh
+; CHECK-NOT: .seh_handler __gcc_personality_seh0
+; CHECK: callq throwit
+; CHECK: .seh_handlerdata
+; CHECK: .seh_endproc
+
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index d0ba057fa009c..b05c4467d3098 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -379,7 +379,7 @@ liveins:
- { reg: '%esi' }
# CHECK: bb.0.entry:
# CHECK: %eax = MOV32ri 2200000
-# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %eax, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
# CHECK-NEXT: JMP_1 %bb.1.not_null
body: |
@@ -544,7 +544,7 @@ liveins:
- { reg: '%rsi' }
# CHECK: bb.0.entry:
# CHECK: %rbx = MOV64rr %rdx
-# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %rbx, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
body: |
bb.0.entry:
@@ -656,7 +656,7 @@ body: |
name: use_alternate_load_op
# CHECK-LABEL: name: use_alternate_load_op
# CHECK: bb.0.entry:
-# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _
+# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -689,7 +689,7 @@ body: |
name: imp_null_check_gep_load_with_use_dep
# CHECK-LABEL: name: imp_null_check_gep_load_with_use_dep
# CHECK: bb.0.entry:
-# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
+# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
# CHECK-NEXT: JMP_1 %bb.1.not_null
alignment: 4
tracksRegLiveness: true
@@ -721,7 +721,7 @@ name: imp_null_check_load_with_base_sep
# CHECK-LABEL: name: imp_null_check_load_with_base_sep
# CHECK: bb.0.entry:
# CHECK: %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
-# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags
+# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %esi, %rdi, 1, _, 0, _, implicit-def %eflags
# CHECK-NEXT: JMP_1 %bb.1.not_null
alignment: 4
tracksRegLiveness: true
@@ -752,7 +752,7 @@ body: |
name: inc_store
# CHECK-LABEL: name: inc_store
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %rsi
+# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %rsi
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -782,7 +782,7 @@ body: |
name: inc_store_plus_offset
# CHECK-LABEL: inc_store_plus_offset
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %rsi
+# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %rsi
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -813,7 +813,7 @@ name: inc_store_with_dep
# CHECK-LABEL: inc_store_with_dep
# CHECK: bb.0.entry:
# CHECK: %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
-# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi
+# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -972,7 +972,7 @@ body: |
name: inc_store_with_reused_base
# CHECK-LABEL: inc_store_with_reused_base
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi
+# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -1174,7 +1174,7 @@ body: |
name: inc_store_with_load_and_store
# CHECK-LABEL: inc_store_with_load_and_store
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags
+# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %esi, implicit-def %eflags
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
@@ -1205,7 +1205,7 @@ body: |
name: inc_store_and_load_no_alias
# CHECK-LABEL: inc_store_and_load_no_alias
# CHECK: bb.0.entry:
-# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
# CHECK-NEXT: JMP_1 %bb.1.not_null
# CHECK: bb.1.not_null
diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll
new file mode 100644
index 0000000000000..a9cf086dbd900
--- /dev/null
+++ b/test/CodeGen/X86/lrshrink.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call
+; to minimize live-range.
+
+define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) {
+entry:
+ br i1 %a, label %then, label %else
+
+then:
+ br label %else
+
+else:
+ %0 = phi i64 [ 4, %entry ], [ 10, %then ]
+ %r = phi i64 [ %r1, %entry ], [ %r2, %then ]
+ %s = phi i64 [ %s1, %entry ], [ %s2, %then ]
+ %t = phi i64 [ %t1, %entry ], [ %t2, %then ]
+; CHECK-LABEL: test:
+; CHECK: add
+; CHECK: add
+; CHECK: call
+; CHECK: add
+; CHECK: call
+; CHECK: add
+; CHECK: call
+; CHECK: add
+ %1 = tail call i32 @_Z3foov()
+ %2 = zext i32 %1 to i64
+ %3 = tail call i32 @_Z3foov()
+ %4 = zext i32 %3 to i64
+ %5 = tail call i32 @_Z3foov()
+ %6 = zext i32 %5 to i64
+ %7 = add nuw nsw i64 %0, %r
+ tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
+ %8 = add nuw nsw i64 %2, %7
+ %9 = add nuw nsw i64 %4, %8
+ %10 = add nuw nsw i64 %6, %9
+ %11 = add nuw nsw i64 %s, %t
+ tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
+ %12 = add nuw nsw i64 %10, %11
+ ret i64 %12
+}
+
+declare i32 @_Z3foov()
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!1, !2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug)
+!1 = !{i32 2, !"Dwarf Version", i32 4}
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DIFile(filename: "a.c", directory: "./")
+!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0)
+!5 = !DILocalVariable(name: "x", scope: !4)
+!6 = !DILocation(line: 4, scope: !4)
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index d332b2f3169f0..af86df5100165 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-8, %rax
@@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm6
-; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm7
-; SSE2-NEXT: pmullw %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-16, %rax
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index e62a1d04dad67..94bbe75702cb8 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -17,9 +17,9 @@
;
; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
; TOPDOWN-LABEL: %for.end
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
index 6d2465ddd3a87..e3e2737cf3e62 100644
--- a/test/CodeGen/X86/mul-constant-i16.ll
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -188,16 +188,13 @@ define i16 @test_mul_by_11(i16 %x) {
; X86-LABEL: test_mul_by_11:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: imull $11, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_11:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,2), %eax
+; X64-NEXT: imull $11, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 11
@@ -228,16 +225,13 @@ define i16 @test_mul_by_13(i16 %x) {
; X86-LABEL: test_mul_by_13:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $13, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_13:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: imull $13, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 13
@@ -247,19 +241,14 @@ define i16 @test_mul_by_13(i16 %x) {
define i16 @test_mul_by_14(i16 %x) {
; X86-LABEL: test_mul_by_14:
; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $14, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_14:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $14, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 14
@@ -349,19 +338,14 @@ define i16 @test_mul_by_19(i16 %x) {
; X86-LABEL: test_mul_by_19:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: shll $2, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $19, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_19:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: shll $2, %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $19, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 19
ret i16 %mul
@@ -391,16 +375,13 @@ define i16 @test_mul_by_21(i16 %x) {
; X86-LABEL: test_mul_by_21:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $21, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_21:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: imull $21, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 21
@@ -410,19 +391,14 @@ define i16 @test_mul_by_21(i16 %x) {
define i16 @test_mul_by_22(i16 %x) {
; X86-LABEL: test_mul_by_22:
; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,4), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $22, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_22:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $22, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 22
@@ -433,19 +409,14 @@ define i16 @test_mul_by_23(i16 %x) {
; X86-LABEL: test_mul_by_23:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $23, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_23:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: shll $3, %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $23, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 23
ret i16 %mul
@@ -495,19 +466,14 @@ define i16 @test_mul_by_26(i16 %x) {
; X86-LABEL: test_mul_by_26:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $26, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_26:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $26, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 26
ret i16 %mul
@@ -536,19 +502,14 @@ define i16 @test_mul_by_27(i16 %x) {
define i16 @test_mul_by_28(i16 %x) {
; X86-LABEL: test_mul_by_28:
; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $28, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_28:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $28, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 28
@@ -558,21 +519,14 @@ define i16 @test_mul_by_28(i16 %x) {
define i16 @test_mul_by_29(i16 %x) {
; X86-LABEL: test_mul_by_29:
; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $29, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_29:
; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: addl %edi, %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $29, %edi, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 29
@@ -583,22 +537,14 @@ define i16 @test_mul_by_30(i16 %x) {
; X86-LABEL: test_mul_by_30:
; X86: # BB#0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shll $5, %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: imull $30, %eax, %eax
; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_30:
; X64: # BB#0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $5, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: subl %eax, %ecx
-; X64-NEXT: subl %ecx, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $30, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
%mul = mul nsw i16 %x, 30
ret i16 %mul
@@ -641,30 +587,3 @@ define i16 @test_mul_by_32(i16 %x) {
%mul = mul nsw i16 %x, 32
ret i16 %mul
}
-
-; (x*9+42)*(x*5+2)
-define i16 @test_mul_spec(i16 %x) nounwind {
-; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal 42(%eax,%eax,8), %ecx
-; X86-NEXT: leal 2(%eax,%eax,4), %eax
-; X86-NEXT: imull %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mul_spec:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx
-; X64-NEXT: leal 2(%rdi,%rdi,4), %eax
-; X64-NEXT: imull %ecx, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-NEXT: retq
- %mul = mul nsw i16 %x, 9
- %add = add nsw i16 %mul, 42
- %mul2 = mul nsw i16 %x, 5
- %add2 = add nsw i16 %mul2, 2
- %mul3 = mul nsw i16 %add, %add2
- ret i16 %mul3
-}
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
index b1e9a929b7f26..76e46e1f1b09e 100644
--- a/test/CodeGen/X86/mul-constant-i32.ll
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -1,12 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
-; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
define i32 @test_mul_by_1(i32 %x) {
; X86-LABEL: test_mul_by_1:
@@ -14,40 +8,10 @@ define i32 @test_mul_by_1(i32 %x) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_1:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_1:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_1:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_1:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_1:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_1:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_1:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_1:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 1
ret i32 %mul
}
@@ -59,47 +23,11 @@ define i32 @test_mul_by_2(i32 %x) {
; X86-NEXT: addl %eax, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_2:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_2:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_2:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: addl %eax, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_2:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_2:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_2:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_2:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_2:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 2
ret i32 %mul
}
@@ -110,46 +38,11 @@ define i32 @test_mul_by_3(i32 %x) {
; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_3:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_3:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_3:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_3:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_3:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_3:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_3:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_3:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 3
ret i32 %mul
}
@@ -161,47 +54,11 @@ define i32 @test_mul_by_4(i32 %x) {
; X86-NEXT: shll $2, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_4:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_4:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_4:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: shll $2, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_4:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_4:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_4:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_4:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_4:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (,%rdi,4), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 4
ret i32 %mul
}
@@ -212,46 +69,11 @@ define i32 @test_mul_by_5(i32 %x) {
; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_5:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_5:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_5:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_5:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_5:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_5:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_5:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_5:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 5
ret i32 %mul
}
@@ -264,46 +86,12 @@ define i32 @test_mul_by_6(i32 %x) {
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_6:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_6:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_6:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_6:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_6:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_6:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_6:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_6:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: leal (%rdi,%rdi,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 6
ret i32 %mul
}
@@ -316,46 +104,12 @@ define i32 @test_mul_by_7(i32 %x) {
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_7:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_7:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_7:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_7:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_7:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_7:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_7:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_7:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 7
ret i32 %mul
}
@@ -367,47 +121,11 @@ define i32 @test_mul_by_8(i32 %x) {
; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_8:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_8:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_8:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: shll $3, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_8:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_8:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_8:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_8:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_8:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (,%rdi,8), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 8
ret i32 %mul
}
@@ -418,46 +136,11 @@ define i32 @test_mul_by_9(i32 %x) {
; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_9:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_9:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_9:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_9:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_9:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_9:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_9:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_9:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,8), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 9
ret i32 %mul
}
@@ -470,46 +153,12 @@ define i32 @test_mul_by_10(i32 %x) {
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_10:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_10:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_10:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_10:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_10:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_10:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_10:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_10:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 10
ret i32 %mul
}
@@ -517,49 +166,13 @@ define i32 @test_mul_by_10(i32 %x) {
define i32 @test_mul_by_11(i32 %x) {
; X86-LABEL: test_mul_by_11:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_11:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_11:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_11:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_11:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_11:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_11:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_11:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_11:
+; X64: # BB#0:
+; X64-NEXT: imull $11, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 11
ret i32 %mul
}
@@ -572,46 +185,12 @@ define i32 @test_mul_by_12(i32 %x) {
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_12:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_12:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_12:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_12:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_12:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_12:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_12:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_12:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: shll $2, %edi
+; X64-NEXT: leal (%rdi,%rdi,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 12
ret i32 %mul
}
@@ -619,49 +198,13 @@ define i32 @test_mul_by_12(i32 %x) {
define i32 @test_mul_by_13(i32 %x) {
; X86-LABEL: test_mul_by_13:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_13:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_13:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_13:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_13:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_13:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_13:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_13:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_13:
+; X64: # BB#0:
+; X64-NEXT: imull $13, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 13
ret i32 %mul
}
@@ -669,52 +212,13 @@ define i32 @test_mul_by_13(i32 %x) {
define i32 @test_mul_by_14(i32 %x) {
; X86-LABEL: test_mul_by_14:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_14:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_14:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_14:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_14:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_14:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_14:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_14:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_14:
+; X64: # BB#0:
+; X64-NEXT: imull $14, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 14
ret i32 %mul
}
@@ -727,46 +231,12 @@ define i32 @test_mul_by_15(i32 %x) {
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_15:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_15:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_15:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_15:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_15:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_15:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_15:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_15:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: leal (%rax,%rax,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 15
ret i32 %mul
}
@@ -778,47 +248,11 @@ define i32 @test_mul_by_16(i32 %x) {
; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_16:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_16:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_16:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: shll $4, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_16:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
-; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_16:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_16:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00]
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_16:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00]
-; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_16:
+; X64: # BB#0:
+; X64-NEXT: shll $4, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 16
ret i32 %mul
}
@@ -832,49 +266,13 @@ define i32 @test_mul_by_17(i32 %x) {
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_17:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_17:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_17:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_17:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_17:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_17:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_17:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_17:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll $4, %eax
+; X64-NEXT: leal (%rax,%rdi), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 17
ret i32 %mul
}
@@ -887,46 +285,12 @@ define i32 @test_mul_by_18(i32 %x) {
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_18:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_18:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_18:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_18:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_18:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_18:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
-; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_18:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_18:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: leal (%rdi,%rdi,8), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 18
ret i32 %mul
}
@@ -934,54 +298,13 @@ define i32 @test_mul_by_18(i32 %x) {
define i32 @test_mul_by_19(i32 %x) {
; X86-LABEL: test_mul_by_19:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: shll $2, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_19:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_19:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_19:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_19:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_19:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_19:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_19:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_19:
+; X64: # BB#0:
+; X64-NEXT: imull $19, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 19
ret i32 %mul
}
@@ -994,46 +317,12 @@ define i32 @test_mul_by_20(i32 %x) {
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_20:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_20:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_20:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_20:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_20:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_20:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_20:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_20:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: shll $2, %edi
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 20
ret i32 %mul
}
@@ -1041,49 +330,13 @@ define i32 @test_mul_by_20(i32 %x) {
define i32 @test_mul_by_21(i32 %x) {
; X86-LABEL: test_mul_by_21:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_21:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_21:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_21:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_21:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_21:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_21:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_21:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_21:
+; X64: # BB#0:
+; X64-NEXT: imull $21, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 21
ret i32 %mul
}
@@ -1091,52 +344,13 @@ define i32 @test_mul_by_21(i32 %x) {
define i32 @test_mul_by_22(i32 %x) {
; X86-LABEL: test_mul_by_22:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,4), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_22:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_22:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_22:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_22:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_22:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_22:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_22:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_22:
+; X64: # BB#0:
+; X64-NEXT: imull $22, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 22
ret i32 %mul
}
@@ -1144,54 +358,13 @@ define i32 @test_mul_by_22(i32 %x) {
define i32 @test_mul_by_23(i32 %x) {
; X86-LABEL: test_mul_by_23:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_23:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_23:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_23:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_23:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_23:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_23:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_23:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_23:
+; X64: # BB#0:
+; X64-NEXT: imull $23, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 23
ret i32 %mul
}
@@ -1204,46 +377,12 @@ define i32 @test_mul_by_24(i32 %x) {
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_24:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_24:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_24:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_24:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_24:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_24:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_24:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_24:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: shll $3, %edi
+; X64-NEXT: leal (%rdi,%rdi,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 24
ret i32 %mul
}
@@ -1256,46 +395,12 @@ define i32 @test_mul_by_25(i32 %x) {
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_25:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_25:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_25:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_25:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_25:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_25:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_25:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_25:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: leal (%rax,%rax,4), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 25
ret i32 %mul
}
@@ -1303,54 +408,13 @@ define i32 @test_mul_by_25(i32 %x) {
define i32 @test_mul_by_26(i32 %x) {
; X86-LABEL: test_mul_by_26:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_26:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_26:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_26:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_26:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_26:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_26:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_26:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_26:
+; X64: # BB#0:
+; X64-NEXT: imull $26, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 26
ret i32 %mul
}
@@ -1363,46 +427,12 @@ define i32 @test_mul_by_27(i32 %x) {
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_27:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_27:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_27:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_27:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_27:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_27:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_27:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_27:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal (%rdi,%rdi,8), %eax
+; X64-NEXT: leal (%rax,%rax,2), %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 27
ret i32 %mul
}
@@ -1410,52 +440,13 @@ define i32 @test_mul_by_27(i32 %x) {
define i32 @test_mul_by_28(i32 %x) {
; X86-LABEL: test_mul_by_28:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_28:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_28:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_28:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_28:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_28:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_28:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_28:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_28:
+; X64: # BB#0:
+; X64-NEXT: imull $28, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 28
ret i32 %mul
}
@@ -1463,55 +454,13 @@ define i32 @test_mul_by_28(i32 %x) {
define i32 @test_mul_by_29(i32 %x) {
; X86-LABEL: test_mul_by_29:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_29:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_29:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_29:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_29:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_29:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_29:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_29:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_29:
+; X64: # BB#0:
+; X64-NEXT: imull $29, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 29
ret i32 %mul
}
@@ -1519,58 +468,13 @@ define i32 @test_mul_by_29(i32 %x) {
define i32 @test_mul_by_30(i32 %x) {
; X86-LABEL: test_mul_by_30:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shll $5, %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_30:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: movl %edi, %ecx # sched: [1:0.25]
-; X64-HSW-NEXT: subl %eax, %ecx # sched: [1:0.25]
-; X64-HSW-NEXT: subl %ecx, %edi # sched: [1:0.25]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_30:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: movl %edi, %ecx # sched: [1:0.17]
-; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %eax, %ecx # sched: [1:0.50]
-; X64-JAG-NEXT: subl %ecx, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_30:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_30:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_30:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_30:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_30:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_30:
+; X64: # BB#0:
+; X64-NEXT: imull $30, %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 30
ret i32 %mul
}
@@ -1584,46 +488,12 @@ define i32 @test_mul_by_31(i32 %x) {
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_31:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_31:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_31:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_31:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_31:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_31:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00]
-; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_31:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_31:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll $5, %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 31
ret i32 %mul
}
@@ -1635,124 +505,11 @@ define i32 @test_mul_by_32(i32 %x) {
; X86-NEXT: shll $5, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_32:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50]
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_32:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_32:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: shll $5, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_32:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
-; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_32:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_32:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00]
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_32:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00]
-; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_32:
+; X64: # BB#0:
+; X64-NEXT: shll $5, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%mul = mul nsw i32 %x, 32
ret i32 %mul
}
-
-; (x*9+42)*(x*5+2)
-define i32 @test_mul_spec(i32 %x) nounwind {
-; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal 42(%eax,%eax,8), %ecx
-; X86-NEXT: leal 2(%eax,%eax,4), %eax
-; X86-NEXT: imull %ecx, %eax
-; X86-NEXT: retl
-;
-; X64-HSW-LABEL: test_mul_spec:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
-; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25]
-; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_spec:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
-; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_spec:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx
-; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax
-; X86-NOOPT-NEXT: imull %ecx, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_spec:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25]
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_spec:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
-; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_spec:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
-; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
-; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_spec:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
-; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
- %mul = mul nsw i32 %x, 9
- %add = add nsw i32 %mul, 42
- %mul2 = mul nsw i32 %x, 5
- %add2 = add nsw i32 %mul2, 2
- %mul3 = mul nsw i32 %add, %add2
- ret i32 %mul3
-}
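
The i32 checks above (mul-constant-i32.ll) and the i64 checks below (mul-constant-i64.ll) all pin down the same strength reduction: a multiply by a small constant is rewritten as a short chain of lea/shl/add/sub identities, e.g. 29*x = 3*(9*x) + x + x and 31*x = (x << 5) - x. As a minimal sketch, not part of this patch and with illustrative names, the identities behind those check lines can be verified in plain C:

#include <assert.h>

/* Identities encoded by the X64 checks in this diff:
 *   29*x == 3*(9*x) + x + x    lea (%rdi,%rdi,8); lea (%rax,%rax,2); add; add
 *   31*x == (x << 5) - x       shl $5; sub
 * Unsigned arithmetic keeps the shift and any overflow well defined;
 * both sides agree modulo 2^32, so the identities hold for all inputs. */
static unsigned mul29(unsigned x) { return 3u * (9u * x) + x + x; }
static unsigned mul31(unsigned x) { return (x << 5) - x; }

int main(void) {
    for (unsigned x = 0; x < 1000000u; ++x) {
        assert(mul29(x) == 29u * x);
        assert(mul31(x) == 31u * x);
    }
    return 0;
}
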
diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll
index 22eb0bdc6c3f8..8579179a82315 100644
--- a/test/CodeGen/X86/mul-constant-i64.ll
+++ b/test/CodeGen/X86/mul-constant-i64.ll
@@ -1,55 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
-; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64

-define i64 @test_mul_by_1(i64 %x) nounwind {
+define i64 @test_mul_by_1(i64 %x) {
; X86-LABEL: test_mul_by_1:
; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_1:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_1:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_1:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_1:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_1:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_1:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_1:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_1:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 1
ret i64 %mul
}
@@ -63,43 +26,10 @@ define i64 @test_mul_by_2(i64 %x) {
; X86-NEXT: addl %eax, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_2:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_2:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_2:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $1, %eax, %edx
-; X86-NOOPT-NEXT: addl %eax, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_2:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_2:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_2:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_2:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_2:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 2
ret i64 %mul
}
@@ -113,43 +43,10 @@ define i64 @test_mul_by_3(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_3:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_3:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_3:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $3, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_3:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_3:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_3:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_3:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_3:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 3
ret i64 %mul
}
@@ -163,43 +60,10 @@ define i64 @test_mul_by_4(i64 %x) {
; X86-NEXT: shll $2, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_4:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_4:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_4:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $2, %eax, %edx
-; X86-NOOPT-NEXT: shll $2, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_4:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_4:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_4:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_4:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_4:
+; X64: # BB#0:
+; X64-NEXT: leaq (,%rdi,4), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 4
ret i64 %mul
}
@@ -213,43 +77,10 @@ define i64 @test_mul_by_5(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_5:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_5:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_5:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $5, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_5:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_5:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_5:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_5:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_5:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 5
ret i64 %mul
}
@@ -264,46 +95,11 @@ define i64 @test_mul_by_6(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,2), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_6:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_6:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_6:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $6, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_6:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_6:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_6:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_6:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_6:
+; X64: # BB#0:
+; X64-NEXT: addq %rdi, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 6
ret i64 %mul
}
@@ -319,46 +115,11 @@ define i64 @test_mul_by_7(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_7:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_7:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_7:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $7, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_7:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_7:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_7:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_7:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_7:
+; X64: # BB#0:
+; X64-NEXT: leaq (,%rdi,8), %rax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 7
ret i64 %mul
}
@@ -372,43 +133,10 @@ define i64 @test_mul_by_8(i64 %x) {
; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_8:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_8:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_8:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $3, %eax, %edx
-; X86-NOOPT-NEXT: shll $3, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_8:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_8:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_8:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_8:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_8:
+; X64: # BB#0:
+; X64-NEXT: leaq (,%rdi,8), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 8
ret i64 %mul
}
@@ -422,43 +150,10 @@ define i64 @test_mul_by_9(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_9:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_9:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_9:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $9, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_9:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_9:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_9:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_9:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_9:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,8), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 9
ret i64 %mul
}
@@ -473,46 +168,11 @@ define i64 @test_mul_by_10(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,2), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_10:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_10:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_10:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $10, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_10:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_10:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_10:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_10:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_10:
+; X64: # BB#0:
+; X64-NEXT: addq %rdi, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 10
ret i64 %mul
}
@@ -520,53 +180,16 @@ define i64 @test_mul_by_10(i64 %x) {
define i64 @test_mul_by_11(i64 %x) {
; X86-LABEL: test_mul_by_11:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,2), %ecx
; X86-NEXT: movl $11, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_11:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_11:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_11:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $11, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_11:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_11:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_11:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_11:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_11:
+; X64: # BB#0:
+; X64-NEXT: imulq $11, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 11
ret i64 %mul
}
@@ -581,46 +204,11 @@ define i64 @test_mul_by_12(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,4), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_12:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_12:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_12:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $12, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_12:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_12:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_12:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_12:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_12:
+; X64: # BB#0:
+; X64-NEXT: shlq $2, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 12
ret i64 %mul
}
@@ -628,53 +216,16 @@ define i64 @test_mul_by_12(i64 %x) {
define i64 @test_mul_by_13(i64 %x) {
; X86-LABEL: test_mul_by_13:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
; X86-NEXT: movl $13, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_13:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_13:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_13:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $13, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_13:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_13:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_13:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_13:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_13:
+; X64: # BB#0:
+; X64-NEXT: imulq $13, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 13
ret i64 %mul
}
@@ -682,56 +233,16 @@ define i64 @test_mul_by_13(i64 %x) {
define i64 @test_mul_by_14(i64 %x) {
; X86-LABEL: test_mul_by_14:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
-; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $14, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_14:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_14:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_14:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $14, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_14:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_14:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_14:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_14:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_14:
+; X64: # BB#0:
+; X64-NEXT: imulq $14, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 14
ret i64 %mul
}
@@ -747,46 +258,11 @@ define i64 @test_mul_by_15(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_15:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_15:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_15:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $15, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_15:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_15:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_15:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_15:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_15:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: leaq (%rax,%rax,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 15
ret i64 %mul
}
@@ -800,49 +276,11 @@ define i64 @test_mul_by_16(i64 %x) {
; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_16:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_16:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_16:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $4, %eax, %edx
-; X86-NOOPT-NEXT: shll $4, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_16:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_16:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_16:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_16:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00]
-; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_16:
+; X64: # BB#0:
+; X64-NEXT: shlq $4, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 16
ret i64 %mul
}
@@ -859,49 +297,12 @@ define i64 @test_mul_by_17(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_17:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_17:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_17:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $17, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_17:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_17:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_17:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00]
-; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_17:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_17:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shlq $4, %rax
+; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 17
ret i64 %mul
}
@@ -916,46 +317,11 @@ define i64 @test_mul_by_18(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,2), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_18:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_18:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_18:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $18, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_18:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_18:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_18:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_18:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_18:
+; X64: # BB#0:
+; X64-NEXT: addq %rdi, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,8), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 18
ret i64 %mul
}
@@ -963,58 +329,16 @@ define i64 @test_mul_by_18(i64 %x) {
define i64 @test_mul_by_19(i64 %x) {
; X86-LABEL: test_mul_by_19:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,4), %eax
-; X86-NEXT: shll $2, %eax
-; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl $19, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_19:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_19:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_19:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $19, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_19:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_19:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_19:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_19:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_19:
+; X64: # BB#0:
+; X64-NEXT: imulq $19, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 19
ret i64 %mul
}
@@ -1029,46 +353,11 @@ define i64 @test_mul_by_20(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,4), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_20:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_20:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_20:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $20, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_20:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_20:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_20:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_20:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_20:
+; X64: # BB#0:
+; X64-NEXT: shlq $2, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 20
ret i64 %mul
}
@@ -1076,53 +365,16 @@ define i64 @test_mul_by_20(i64 %x) {
define i64 @test_mul_by_21(i64 %x) {
; X86-LABEL: test_mul_by_21:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
; X86-NEXT: movl $21, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_21:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_21:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_21:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $21, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_21:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_21:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_21:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_21:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_21:
+; X64: # BB#0:
+; X64-NEXT: imulq $21, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 21
ret i64 %mul
}
@@ -1130,56 +382,16 @@ define i64 @test_mul_by_21(i64 %x) {
define i64 @test_mul_by_22(i64 %x) {
; X86-LABEL: test_mul_by_22:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
-; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $22, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_22:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_22:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_22:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $22, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_22:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_22:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_22:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_22:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_22:
+; X64: # BB#0:
+; X64-NEXT: imulq $22, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 22
ret i64 %mul
}
@@ -1187,58 +399,16 @@ define i64 @test_mul_by_22(i64 %x) {
define i64 @test_mul_by_23(i64 %x) {
; X86-LABEL: test_mul_by_23:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %eax
-; X86-NEXT: shll $3, %eax
-; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl $23, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_23:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_23:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_23:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $23, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_23:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_23:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_23:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_23:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_23:
+; X64: # BB#0:
+; X64-NEXT: imulq $23, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 23
ret i64 %mul
}
@@ -1253,46 +423,11 @@ define i64 @test_mul_by_24(i64 %x) {
; X86-NEXT: leal (%edx,%ecx,8), %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_24:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_24:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_24:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $24, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_24:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_24:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_24:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_24:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_24:
+; X64: # BB#0:
+; X64-NEXT: shlq $3, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 24
ret i64 %mul
}
@@ -1308,46 +443,11 @@ define i64 @test_mul_by_25(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_25:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_25:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_25:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $25, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_25:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_25:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_25:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_25:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_25:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 25
ret i64 %mul
}
@@ -1355,58 +455,16 @@ define i64 @test_mul_by_25(i64 %x) {
define i64 @test_mul_by_26(i64 %x) {
; X86-LABEL: test_mul_by_26:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl $26, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_26:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_26:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_26:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $26, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_26:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_26:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_26:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_26:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_26:
+; X64: # BB#0:
+; X64-NEXT: imulq $26, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 26
ret i64 %mul
}
@@ -1422,46 +480,11 @@ define i64 @test_mul_by_27(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_27:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_27:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_27:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $27, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_27:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_27:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_27:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_27:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_27:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,8), %rax
+; X64-NEXT: leaq (%rax,%rax,2), %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 27
ret i64 %mul
}
@@ -1469,56 +492,16 @@ define i64 @test_mul_by_27(i64 %x) {
define i64 @test_mul_by_28(i64 %x) {
; X86-LABEL: test_mul_by_28:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $28, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_28:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_28:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_28:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $28, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_28:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_28:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_28:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_28:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_28:
+; X64: # BB#0:
+; X64-NEXT: imulq $28, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 28
ret i64 %mul
}
@@ -1526,59 +509,16 @@ define i64 @test_mul_by_28(i64 %x) {
define i64 @test_mul_by_29(i64 %x) {
; X86-LABEL: test_mul_by_29:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl $29, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_29:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_29:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_29:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $29, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_29:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_29:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_29:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_29:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_29:
+; X64: # BB#0:
+; X64-NEXT: imulq $29, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 29
ret i64 %mul
}
@@ -1586,60 +526,16 @@ define i64 @test_mul_by_29(i64 %x) {
define i64 @test_mul_by_30(i64 %x) {
; X86-LABEL: test_mul_by_30:
; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $5, %ecx
; X86-NEXT: movl $30, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_30:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: movq %rdi, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: subq %rax, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: subq %rcx, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_30:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: movq %rdi, %rcx # sched: [1:0.17]
-; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rcx # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rcx, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_30:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $30, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_30:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_30:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_30:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_30:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_30:
+; X64: # BB#0:
+; X64-NEXT: imulq $30, %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 30
ret i64 %mul
}
@@ -1656,49 +552,12 @@ define i64 @test_mul_by_31(i64 %x) {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_31:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_31:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_31:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $31, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_31:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_31:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_31:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00]
-; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_31:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_31:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shlq $5, %rax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 31
ret i64 %mul
}
@@ -1712,168 +571,11 @@ define i64 @test_mul_by_32(i64 %x) {
; X86-NEXT: shll $5, %eax
; X86-NEXT: retl
;
-; X64-HSW-LABEL: test_mul_by_32:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_32:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_32:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $5, %eax, %edx
-; X86-NOOPT-NEXT: shll $5, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_32:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_32:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_32:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_32:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00]
-; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_32:
+; X64: # BB#0:
+; X64-NEXT: shlq $5, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%mul = mul nsw i64 %x, 32
ret i64 %mul
}
-
-; (x*9+42)*(x*5+2)
-define i64 @test_mul_spec(i64 %x) nounwind {
-; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl $9, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: leal (%edi,%edi,8), %ebx
-; X86-NEXT: addl $42, %esi
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movl $5, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: leal (%edi,%edi,4), %edi
-; X86-NEXT: addl $2, %ecx
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: imull %esi, %edi
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: retl
-;
-; X64-HSW-LABEL: test_mul_spec:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_spec:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_spec:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: pushl %ebx
-; X86-NOOPT-NEXT: pushl %edi
-; X86-NOOPT-NEXT: pushl %esi
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NOOPT-NEXT: movl $9, %edx
-; X86-NOOPT-NEXT: movl %ecx, %eax
-; X86-NOOPT-NEXT: mull %edx
-; X86-NOOPT-NEXT: movl %eax, %esi
-; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx
-; X86-NOOPT-NEXT: addl $42, %esi
-; X86-NOOPT-NEXT: adcl %edx, %ebx
-; X86-NOOPT-NEXT: movl $5, %edx
-; X86-NOOPT-NEXT: movl %ecx, %eax
-; X86-NOOPT-NEXT: mull %edx
-; X86-NOOPT-NEXT: movl %eax, %ecx
-; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi
-; X86-NOOPT-NEXT: addl $2, %ecx
-; X86-NOOPT-NEXT: adcl %edx, %edi
-; X86-NOOPT-NEXT: movl %esi, %eax
-; X86-NOOPT-NEXT: mull %ecx
-; X86-NOOPT-NEXT: imull %esi, %edi
-; X86-NOOPT-NEXT: addl %edi, %edx
-; X86-NOOPT-NEXT: imull %ebx, %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: popl %esi
-; X86-NOOPT-NEXT: popl %edi
-; X86-NOOPT-NEXT: popl %ebx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_spec:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25]
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_spec:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_spec:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
-; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_spec:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
-; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
- %mul = mul nsw i64 %x, 9
- %add = add nsw i64 %mul, 42
- %mul2 = mul nsw i64 %x, 5
- %add2 = add nsw i64 %mul2, 2
- %mul3 = mul nsw i64 %add, %add2
- ret i64 %mul3
-}
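(For reference, a minimal IR sketch of the 31*x strength reduction the surviving X64 checks above expect — hypothetical function name; fed to llc for x86-64 this lowers to the shlq $5 / subq pair rather than an imulq:)

; 31*x == (x << 5) - x, so the multiply becomes a shift plus a subtract.
define i64 @mul31_sketch(i64 %x) {
  %shl = shl i64 %x, 5      ; 32*x
  %sub = sub i64 %shl, %x   ; 32*x - x = 31*x
  ret i64 %sub
}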
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index d26cf02dd9424..0bda41a30c697 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2-LABEL: interleave_24i8_in:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE42: # BB#0:
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
-; SSE42-NEXT: movdqa %xmm2, %xmm3
+; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
-; SSE42-NEXT: por %xmm1, %xmm3
+; SSE42-NEXT: por %xmm2, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm0, %xmm2
-; SSE42-NEXT: movq %xmm2, 16(%rdi)
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: movq %xmm1, 16(%rdi)
; SSE42-NEXT: movdqu %xmm3, (%rdi)
; SSE42-NEXT: retq
;
@@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm1, (%rdi)
+; AVX-NEXT: vmovdqu %xmm2, (%rdi)
; AVX-NEXT: retq
%s1 = load <8 x i8>, <8 x i8>* %q1, align 4
%s2 = load <8 x i8>, <8 x i8>* %q2, align 4
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 88cb7a6d58258..50a661fcca114 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: retq
;
@@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pmuludq %xmm7, %xmm5
+; SSE2-NEXT: pmuludq %xmm7, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pmuludq %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm5
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
@@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm4, %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: pmuludq %xmm5, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pmuludq %xmm6, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmuludq %xmm7, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: pmuludq %xmm5, %xmm0
-; SSE41-NEXT: pmuludq %xmm8, %xmm4
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
@@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
@@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT: pmuldq %xmm4, %xmm3
; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
+; SSE41-NEXT: pmuldq %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm4
; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
-; SSE41-NEXT: pmuldq %xmm5, %xmm0
-; SSE41-NEXT: pmuldq %xmm7, %xmm4
-; SSE41-NEXT: pmuldq %xmm6, %xmm2
-; SSE41-NEXT: pmuldq %xmm8, %xmm3
+; SSE41-NEXT: pmuldq %xmm7, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
@@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
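(The reordered pmuludq/pmuldq checks above all stem from the same shape: a 32x32->64 multiply of extended inputs whose high halves are then extracted. A minimal zero-extended sketch of that shape — hypothetical name, an illustration rather than the exact test body:)

define <4 x i32> @mul_hi_zext_sketch(<4 x i32> %a, <4 x i32> %b) {
  %za = zext <4 x i32> %a to <4 x i64>   ; upper 32 bits known zero
  %zb = zext <4 x i32> %b to <4 x i64>
  %m  = mul <4 x i64> %za, %zb           ; selectable as pmuludq on SSE2
  %hi = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %r  = trunc <4 x i64> %hi to <4 x i32> ; keep only the high halves
  ret <4 x i32> %r
}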
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 571dd6774906a..c54909cf93c19 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -1,81 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X86-O0
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X64-O0
+; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
+; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
+; REQUIRES: asserts
@c = external constant i8, align 1
define void @foo() {
-; X86-LABEL: foo:
-; X86: # BB#0: # %entry
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: .Lcfi0:
-; X86-NEXT: .cfi_def_cfa_offset 12
-; X86-NEXT: movzbl c, %eax
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %cl
-; X86-NEXT: testb %al, %al
-; X86-NEXT: setne {{[0-9]+}}(%esp)
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: setle %dl
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: retl
-;
-; X86-O0-LABEL: foo:
-; X86-O0: # BB#0: # %entry
-; X86-O0-NEXT: subl $12, %esp
-; X86-O0-NEXT: .Lcfi0:
-; X86-O0-NEXT: .cfi_def_cfa_offset 16
-; X86-O0-NEXT: movb c, %al
-; X86-O0-NEXT: testb %al, %al
-; X86-O0-NEXT: setne {{[0-9]+}}(%esp)
-; X86-O0-NEXT: movzbl c, %ecx
-; X86-O0-NEXT: testl %ecx, %ecx
-; X86-O0-NEXT: setne %al
-; X86-O0-NEXT: movzbl %al, %edx
-; X86-O0-NEXT: subl %ecx, %edx
-; X86-O0-NEXT: setle %al
-; X86-O0-NEXT: andb $1, %al
-; X86-O0-NEXT: movzbl %al, %ecx
-; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-O0-NEXT: addl $12, %esp
-; X86-O0-NEXT: retl
-;
-; X64-LABEL: foo:
-; X64: # BB#0: # %entry
-; X64-NEXT: movzbl {{.*}}(%rip), %eax
-; X64-NEXT: testb %al, %al
-; X64-NEXT: setne -{{[0-9]+}}(%rsp)
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %cl
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %eax, %ecx
-; X64-NEXT: setle %dl
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: retq
-;
-; X64-O0-LABEL: foo:
-; X64-O0: # BB#0: # %entry
-; X64-O0-NEXT: movb {{.*}}(%rip), %al
-; X64-O0-NEXT: testb %al, %al
-; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx
-; X64-O0-NEXT: testl %ecx, %ecx
-; X64-O0-NEXT: setne %al
-; X64-O0-NEXT: movzbl %al, %edx
-; X64-O0-NEXT: subl %ecx, %edx
-; X64-O0-NEXT: setle %al
-; X64-O0-NEXT: andb $1, %al
-; X64-O0-NEXT: movzbl %al, %ecx
-; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-O0-NEXT: retq
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-DAG: setne
+; CHECK-DAG: setle
+; CHECK: ret
entry:
%a = alloca i8, align 1
%b = alloca i32, align 4
@@ -100,3 +36,125 @@ entry:
store i32 %conv8, i32* %b, align 4
ret void
}
+
+@var_5 = external global i32, align 4
+@var_57 = external global i64, align 8
+@_ZN8struct_210member_2_0E = external global i64, align 8
+
+define void @f1() {
+; CHECK-LABEL: f1:
+; CHECK: # BB#0: # %entry
+; CHECK: sete
+; X64: addq $7093, {{.*}}
+; 686: addl $7093, {{.*}}
+; CHECK: ret
+entry:
+ %a = alloca i8, align 1
+ %0 = load i32, i32* @var_5, align 4
+ %conv = sext i32 %0 to i64
+ %add = add nsw i64 %conv, 8381627093
+ %tobool = icmp ne i64 %add, 0
+ %frombool = zext i1 %tobool to i8
+ store i8 %frombool, i8* %a, align 1
+ %1 = load i32, i32* @var_5, align 4
+ %neg = xor i32 %1, -1
+ %tobool1 = icmp ne i32 %neg, 0
+ %lnot = xor i1 %tobool1, true
+ %conv2 = zext i1 %lnot to i64
+ %2 = load i32, i32* @var_5, align 4
+ %conv3 = sext i32 %2 to i64
+ %add4 = add nsw i64 %conv3, 7093
+ %cmp = icmp sgt i64 %conv2, %add4
+ %conv5 = zext i1 %cmp to i64
+ store i64 %conv5, i64* @var_57, align 8
+ %3 = load i32, i32* @var_5, align 4
+ %neg6 = xor i32 %3, -1
+ %tobool7 = icmp ne i32 %neg6, 0
+ %lnot8 = xor i1 %tobool7, true
+ %conv9 = zext i1 %lnot8 to i64
+ store i64 %conv9, i64* @_ZN8struct_210member_2_0E, align 8
+ ret void
+}
+
+
+@var_7 = external global i8, align 1
+
+define void @f2() {
+; CHECK-LABEL: f2:
+; CHECK: # BB#0: # %entry
+; X64: movzbl {{.*}}(%rip), %[[R:[a-z]*]]
+; 686: movzbl {{.*}}, %[[R:[a-z]*]]
+; CHECK: test{{[qlwb]}} %[[R]], %[[R]]
+; CHECK: sete {{.*}}
+; CHECK: ret
+entry:
+ %a = alloca i16, align 2
+ %0 = load i8, i8* @var_7, align 1
+ %conv = zext i8 %0 to i32
+ %1 = load i8, i8* @var_7, align 1
+ %tobool = icmp ne i8 %1, 0
+ %lnot = xor i1 %tobool, true
+ %conv1 = zext i1 %lnot to i32
+ %xor = xor i32 %conv, %conv1
+ %conv2 = trunc i32 %xor to i16
+ store i16 %conv2, i16* %a, align 2
+ %2 = load i8, i8* @var_7, align 1
+ %conv3 = zext i8 %2 to i16
+ %tobool4 = icmp ne i16 %conv3, 0
+ %lnot5 = xor i1 %tobool4, true
+ %conv6 = zext i1 %lnot5 to i32
+ %3 = load i8, i8* @var_7, align 1
+ %conv7 = zext i8 %3 to i32
+ %cmp = icmp eq i32 %conv6, %conv7
+ %conv8 = zext i1 %cmp to i32
+ %conv9 = trunc i32 %conv8 to i16
+ store i16 %conv9, i16* undef, align 2
+ ret void
+}
+
+
+@var_13 = external global i32, align 4
+@var_16 = external global i32, align 4
+@var_46 = external global i32, align 4
+
+define void @f3() #0 {
+; CHECK-LABEL: f3:
+; X64-DAG: movl var_13(%rip), {{.*}}
+; X64-DAG: movl var_16(%rip), {{.*}}
+; X64-DAG: movl {{.*}},{{.*}}var_46{{.*}}
+; X64: retq
+; 686-DAG: movl var_13, {{.*}}
+; 686-DAG: movl var_16, {{.*}}
+; 686-DAG: movl {{.*}},{{.*}}var_46{{.*}}
+; 686: retl
+entry:
+ %a = alloca i64, align 8
+ %0 = load i32, i32* @var_13, align 4
+ %neg = xor i32 %0, -1
+ %conv = zext i32 %neg to i64
+ %1 = load i32, i32* @var_13, align 4
+ %tobool = icmp ne i32 %1, 0
+ %lnot = xor i1 %tobool, true
+ %conv1 = zext i1 %lnot to i64
+ %2 = load i32, i32* @var_13, align 4
+ %neg2 = xor i32 %2, -1
+ %3 = load i32, i32* @var_16, align 4
+ %xor = xor i32 %neg2, %3
+ %conv3 = zext i32 %xor to i64
+ %and = and i64 %conv1, %conv3
+ %or = or i64 %conv, %and
+ store i64 %or, i64* %a, align 8
+ %4 = load i32, i32* @var_13, align 4
+ %neg4 = xor i32 %4, -1
+ %conv5 = zext i32 %neg4 to i64
+ %5 = load i32, i32* @var_13, align 4
+ %tobool6 = icmp ne i32 %5, 0
+ %lnot7 = xor i1 %tobool6, true
+ %conv8 = zext i1 %lnot7 to i64
+ %and9 = and i64 %conv8, 0
+ %or10 = or i64 %conv5, %and9
+ %conv11 = trunc i64 %or10 to i32
+ store i32 %conv11, i32* @var_46, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr32610.ll b/test/CodeGen/X86/pr32610.ll
new file mode 100644
index 0000000000000..1116cf6f1b29a
--- /dev/null
+++ b/test/CodeGen/X86/pr32610.ll
@@ -0,0 +1,40 @@
+; RUN: llc -o - %s | FileCheck %s
+
+; CHECK-LABEL: @pr32610
+; CHECK: movl L_b$non_lazy_ptr, [[BASEREG:%[a-z]+]]
+; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
+; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.13.0"
+
+@c = external local_unnamed_addr global i32, align 4
+@b = external local_unnamed_addr global [1 x i32], align 4
+@d = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: norecurse nounwind optsize ssp
+define void @pr32610() local_unnamed_addr #0 {
+entry:
+ %0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i32 0, i32 undef), align 4, !tbaa !1
+ %cmp = icmp eq i32 undef, %0
+ %conv = zext i1 %cmp to i32
+ %tobool1.i = icmp ne i32 undef, 0
+ %or.cond.i = and i1 %cmp, %tobool1.i
+ %cond.i = select i1 %or.cond.i, i32 %conv, i32 undef
+ store i32 %cond.i, i32* @c, align 4, !tbaa !1
+ %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4
+ %tobool = icmp ne i32 %1, 0
+ %2 = select i1 %tobool, i32 %1, i32 undef
+ store i32 %2, i32* @d, align 4, !tbaa !1
+ ret void
+}
+
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301507) (llvm/trunk 301505)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 5d5150ad62d60..4be3a4c2391b4 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB0_4:
-; 32-NEXT: orl %esi, %eax
; 32-NEXT: orl %ebx, %edx
+; 32-NEXT: orl %esi, %eax
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB1_4:
-; 32-NEXT: orl %ebx, %eax
; 32-NEXT: orl %esi, %edx
+; 32-NEXT: orl %ebx, %eax
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-LABEL: rotr1_64_mem:
; 32: # BB#0:
; 32-NEXT: pushl %esi
-; 32-NEXT: movl 8(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: movl (%eax), %ecx
; 32-NEXT: movl 4(%eax), %edx
; 32-NEXT: movl %edx, %esi
@@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-NEXT: movl %ecx, 4(%eax)
; 32-NEXT: movl %esi, (%eax)
; 32-NEXT: popl %esi
-
+; 32-NEXT: retl
+;
; 64-LABEL: rotr1_64_mem:
; 64: # BB#0:
; 64-NEXT: rorq (%rdi)
; 64-NEXT: retq
+
%A = load i64, i64 *%Aptr
%B = shl i64 %A, 63
%C = lshr i64 %A, 1
@@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
define void @rotr1_32_mem(i32* %Aptr) nounwind {
; 32-LABEL: rotr1_32_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorl (%eax)
; 32-NEXT: retl
;
@@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind {
define void @rotr1_16_mem(i16* %Aptr) nounwind {
; 32-LABEL: rotr1_16_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorw (%eax)
; 32-NEXT: retl
;
@@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind {
define void @rotr1_8_mem(i8* %Aptr) nounwind {
; 32-LABEL: rotr1_8_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorb (%eax)
; 32-NEXT: retl
;
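(All of the *_mem rotates above start from the same shl/lshr/or idiom, which the backend matches into a single ror instruction; a minimal register-operand sketch of the 32-bit case, hypothetical name:)

define i32 @rotr1_sketch(i32 %a) {
  %hi = shl i32 %a, 31    ; bit 0 becomes the new top bit
  %lo = lshr i32 %a, 1    ; the remaining bits shift down one
  %r  = or i32 %hi, %lo   ; together: rotate right by one
  ret i32 %r
}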
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index b8a8b8afd14fd..6a565a5c76f0b 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -149,127 +149,131 @@ middle.block:
define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm10, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm10, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm4, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm14
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm8
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
; SSE2-NEXT: paddd %xmm7, %xmm13
-; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm14
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm3
-; SSE2-NEXT: paddd %xmm14, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm14, %xmm13
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -398,288 +402,284 @@ middle.block:
define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: subq $184, %rsp
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
-; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm11
-; SSE2-NEXT: movdqa a+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm8
-; SSE2-NEXT: movdqa %xmm13, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm13, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm11
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps a+1040(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm15, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm15, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: pxor %xmm0, %xmm10
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm13, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: pxor %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm11
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: pxor %xmm0, %xmm12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm8
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm11
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm12
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm15, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm15
+; SSE2-NEXT: paddd %xmm15, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm15
+; SSE2-NEXT: movdqa %xmm10, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: paddd %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa %xmm14, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm14
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm14, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm12, %xmm8
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm12
-; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm2, %xmm4
-; SSE2-NEXT: paddd %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm2
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm2, %xmm15
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm13
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: paddd %xmm11, %xmm10
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: paddd %xmm13, %xmm1
+; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm14, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm8, %xmm7
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
-; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: addq $184, %rsp
+; SSE2-NEXT: addq $200, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
@@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
@@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
+; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
-; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
-; AVX2-NEXT: vpabsd %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpabsd %ymm9, %ymm8
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
-; AVX2-NEXT: vpabsd %ymm14, %ymm8
-; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpabsd %ymm13, %ymm8
-; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpabsd %ymm12, %ymm8
; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpabsd %ymm11, %ymm8
-; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
-; AVX2-NEXT: vpabsd %ymm10, %ymm8
-; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
-; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm14, %ymm8
+; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
; AVX2-NEXT: vpabsd %ymm15, %ymm8
-; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
-; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
-; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7
; AVX512F-NEXT: vpabsd %zmm4, %zmm4
-; AVX512F-NEXT: vpabsd %zmm5, %zmm5
-; AVX512F-NEXT: vpabsd %zmm6, %zmm6
-; AVX512F-NEXT: vpabsd %zmm7, %zmm7
-; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
-; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpabsd %zmm5, %zmm4
+; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpabsd %zmm6, %zmm4
+; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vpabsd %zmm7, %zmm4
+; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: # BB#2: # %middle.block
@@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-LABEL: sad_nonloop_32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqu (%rdx), %xmm5
-; SSE2-NEXT: movdqu 16(%rdx), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: psubd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm7, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: psubd %xmm1, %xmm12
-; SSE2-NEXT: psubd %xmm8, %xmm6
-; SSE2-NEXT: psubd %xmm15, %xmm11
-; SSE2-NEXT: psubd %xmm14, %xmm10
-; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: movdqu 16(%rdi), %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm12, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm12, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movdqu (%rdx), %xmm7
+; SSE2-NEXT: movdqu 16(%rdx), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm10
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm6, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm2, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm3, %xmm12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
@@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm13
; SSE2-NEXT: pxor %xmm1, %xmm13
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: paddd %xmm10, %xmm4
+; SSE2-NEXT: paddd %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm6
; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm6, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm0
+; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
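A minimal LLVM IR sketch of the sum-of-absolute-differences reduction the sad.ll checks above exercise, reduced from 64 to 4 lanes for brevity; the function name is hypothetical and the sketch is not part of this patch. The select-based abs is one IR-level way to express what the psrad/paddd/pxor sequences above implement:

define i32 @sad4_sketch(<4 x i8> %a, <4 x i8> %b) {
  ; widen both inputs so the subtraction cannot wrap
  %za = zext <4 x i8> %a to <4 x i32>
  %zb = zext <4 x i8> %b to <4 x i32>
  %d  = sub <4 x i32> %za, %zb
  ; absolute value via compare-and-select
  %n  = sub <4 x i32> zeroinitializer, %d
  %c  = icmp sgt <4 x i32> %d, zeroinitializer
  %ad = select <4 x i1> %c, <4 x i32> %d, <4 x i32> %n
  ; horizontal add down to one lane (the pshufd/paddd tail above)
  %h1 = shufflevector <4 x i32> %ad, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %s1 = add <4 x i32> %ad, %h1
  %h2 = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %s2 = add <4 x i32> %s1, %h2
  %r  = extractelement <4 x i32> %s2, i32 0
  ret i32 %r
}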
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index ce42d0d643e8b..1afef86a5f11d 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: testb %dil, %dil
; GENERIC-NEXT: jne LBB7_4
; GENERIC-NEXT: ## BB#5:
+; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: jmp LBB7_6
; GENERIC-NEXT: LBB7_4:
-; GENERIC-NEXT: movd %r9d, %xmm2
-; GENERIC-NEXT: movd %ecx, %xmm3
-; GENERIC-NEXT: movd %r8d, %xmm4
+; GENERIC-NEXT: movd %r9d, %xmm1
+; GENERIC-NEXT: movd %ecx, %xmm2
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; GENERIC-NEXT: movd %r8d, %xmm3
; GENERIC-NEXT: movd %edx, %xmm1
; GENERIC-NEXT: LBB7_6:
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
@@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; ATOM-NEXT: jmp LBB7_6
; ATOM-NEXT: LBB7_4:
-; ATOM-NEXT: movd %r9d, %xmm2
-; ATOM-NEXT: movd %ecx, %xmm3
-; ATOM-NEXT: movd %r8d, %xmm4
+; ATOM-NEXT: movd %r9d, %xmm1
+; ATOM-NEXT: movd %ecx, %xmm2
+; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; ATOM-NEXT: movd %r8d, %xmm3
; ATOM-NEXT: movd %edx, %xmm1
-; ATOM-NEXT: LBB7_6:
-; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; ATOM-NEXT: LBB7_6:
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
; ATOM-NEXT: movq %xmm0, 16(%rsi)
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 1b8f8e7ae559c..2628f824ea407 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -45,64 +45,21 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; AVX-LABEL: pr26232:
; AVX: # BB#0: # %for_loop599.preheader
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB1_1: # %for_loop599
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000
; AVX-NEXT: setl %al
-; AVX-NEXT: vmovd %eax, %xmm2
-; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vpextrb $15, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $14, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $13, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $12, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $11, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $10, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $9, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $8, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $7, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $6, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $5, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $4, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $3, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $2, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $1, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $0, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: cmpw $0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovd %eax, %xmm3
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3
+; AVX-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpmovmskb %xmm3, %eax
+; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: jne .LBB1_1
; AVX-NEXT: # BB#2: # %for_exit600
; AVX-NEXT: retq
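The rewritten pr26232 body checks all sixteen mask lanes at once through vpmovmskb rather than spilling each lane with vpextrb. At the IR level that shape corresponds to packing a mask vector into an integer, roughly as in this hypothetical sketch (not part of the patch; lane-bit order assumes the usual little-endian layout):

define i1 @any_lane_set(<16 x i8> %m) {
  ; keep bit 0 of each lane, then pack the 16 lane bits into one i16
  %t = trunc <16 x i8> %m to <16 x i1>
  %b = bitcast <16 x i1> %t to i16
  ; nonzero means at least one lane was set (the testw %ax, %ax above)
  %r = icmp ne i16 %b, 0
  ret i1 %r
}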
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 2996edaec3e0e..332bf2887fb05 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
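Both ne_i256 and eq_i256 lower the same wide-integer comparison: four movq/xorq pairs folded together with orq, then a single setne/sete. A sketch of the IR shape such a test plausibly uses (the actual test bodies are not shown in this hunk, so the exact form is an assumption):

define i32 @ne_i256_sketch(<4 x i64> %x, <4 x i64> %y) {
  %bx = bitcast <4 x i64> %x to i256
  %by = bitcast <4 x i64> %y to i256
  ; one i256 compare, expanded above into xorq/orq chains plus setne
  %c = icmp ne i256 %bx, %by
  %r = zext i1 %c to i32
  ret i32 %r
}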
diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll
index c869dff9e6423..6701c247e6fc5 100644
--- a/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi
; CHECK-NEXT: movzbl (%edx,%ecx), %edx
; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx
+; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movzbl (%eax,%ecx), %eax
; CHECK-NEXT: imull %edx, %eax
-; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4)
; CHECK-NEXT: movl %eax, (%esi,%ecx,4)
; CHECK-NEXT: popl %esi
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 503b9416c8d38..4a0dc9c1eb171 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32: ## BB#0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: addss %xmm2, %xmm3
+; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT: retl
;
@@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X64: ## BB#0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: addss %xmm2, %xmm3
+; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT: retq
entry:
@@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm3
; X32-NEXT: addps %xmm3, %xmm0
; X32-NEXT: retl
@@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm3
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index 226c0adbaf3c3..2fb821555dba5 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index a05a981daa1f0..f0a5fe1dbfffb 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_logic_v8i32:
@@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm3
; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_logic_v8i32:
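The pand/pandn/por triple in blend_logic_v8i32 is the classic SSE2 bitwise select driven by a sign mask. One IR rendering of that idiom, as a hypothetical 4-lane sketch rather than the test's own source:

define <4 x i32> @blend_sketch(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
  ; broadcast each lane's sign bit (psrad $31)
  %m  = ashr <4 x i32> %mask, <i32 31, i32 31, i32 31, i32 31>
  %nm = xor <4 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1>
  ; (a & m) | (b & ~m): pand, pandn, por
  %ta = and <4 x i32> %m, %a
  %tb = and <4 x i32> %nm, %b
  %r  = or <4 x i32> %ta, %tb
  ret <4 x i32> %r
}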
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index f4d0503f4a792..4181a374c61ce 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -120,9 +120,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll
index fddc7906e08f9..939fa0404223d 100644
--- a/test/CodeGen/X86/xchg-nofold.ll
+++ b/test/CodeGen/X86/xchg-nofold.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s
%"struct.std::atomic" = type { %"struct.std::atomic_bool" }
@@ -6,6 +7,28 @@
; CHECK-LABEL: _Z3fooRSt6atomicIbEb
define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind {
+; CHECK-LABEL: _Z3fooRSt6atomicIbEb:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq $3, %rax
+; CHECK-NEXT: movb 2147450880(%rax), %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB0_3
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andl $7, %ecx
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: jge .LBB0_2
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xchgb %al, (%rdi)
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq __asan_report_store1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
entry:
%frombool.i.i = zext i1 %b to i8
%_M_i.i.i = getelementptr inbounds %"struct.std::atomic", %"struct.std::atomic"* %a, i64 0, i32 0, i32 0, i32 0
@@ -30,7 +53,6 @@ entry:
; <label>:11: ; preds = %6, %entry
store atomic i8 %frombool.i.i, i8* %_M_i.i.i seq_cst, align 1
-; CHECK: xchgb %{{.*}}, (%{{.*}})
ret i1 %b
}
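
Editor's note: the autogenerated checks above spell out the ASan shadow probe that the old single-line xchgb check glossed over. A minimal C sketch of that probe, assuming the default x86-64 shadow mapping (scale 8, offset 2147450880 = 0x7fff8000), matching the shrq $3 / movb 2147450880(%rax) / andl $7 / cmpb sequence:

#include <stdint.h>

/* Sketch of the shadow check encoded by the CHECK lines above: addr >> 3
   plus the shadow offset selects the shadow byte; zero means the whole
   8-byte granule is addressable, otherwise the low address bits are
   compared (signed, hence jge) against the shadow value. */
static inline int asan_store1_would_report(uintptr_t addr) {
    int8_t shadow = *(int8_t *)((addr >> 3) + 2147450880u);
    if (shadow == 0)
        return 0;                         /* whole granule addressable */
    return (int8_t)(addr & 7) >= shadow;  /* report iff offset >= shadow */
}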
diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir
index 1766a8f446160..71d10fe9de94c 100644
--- a/test/DebugInfo/MIR/X86/empty-inline.mir
+++ b/test/DebugInfo/MIR/X86/empty-inline.mir
@@ -73,7 +73,6 @@
name: _ZN1C5m_fn3Ev
alignment: 4
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
diff --git a/test/DebugInfo/omit-empty.ll b/test/DebugInfo/omit-empty.ll
index 92450050d2089..8b277676f94ca 100644
--- a/test/DebugInfo/omit-empty.ll
+++ b/test/DebugInfo/omit-empty.ll
@@ -1,4 +1,5 @@
; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s
+; REQUIRES: default_triple
; CHECK-NOT: .debug_
diff --git a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
index 092c9dc6b95be..f7f63bd6be807 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
@@ -14,8 +14,8 @@
; clang++ ../1.cc -O3 -g -S -emit-llvm -fno-strict-aliasing
; and add sanitize_address to @_ZN1A1fEv
-; Test that __sanitizer_cov call has !dbg pointing to the opening { of A::f().
-; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[A:!.*]]
+; Test that __sanitizer_cov_trace_pc_guard call has !dbg pointing to the opening { of A::f().
+; CHECK: call void @__sanitizer_cov_trace_pc_guard(i32*{{.*}}), !dbg [[A:!.*]]
; CHECK: [[A]] = !DILocation(line: 6, scope: !{{.*}})
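
Editor's note: for reference, the callback the updated check now matches is the guard-based coverage hook from the SanitizerCoverage interface; user-supplied implementations use this C signature:

#include <stdint.h>

/* Guard-based coverage callback; the instrumentation passes a pointer to a
   per-edge 32-bit guard variable. */
void __sanitizer_cov_trace_pc_guard(uint32_t *guard);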
diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll
index d675c9d9c3709..7b6b5f00442fe 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage.ll
@@ -1,16 +1,5 @@
-; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s --check-prefix=CHECK0
-; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s --check-prefix=CHECK1
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
-; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3
; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_TRACE_PC
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \
-; RUN: -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 \
-; RUN: -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=CHECKPRUNE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Instrumentation/SanitizerCoverage/seh.ll b/test/Instrumentation/SanitizerCoverage/seh.ll
index ce18334ed2074..f432573af64a6 100644
--- a/test/Instrumentation/SanitizerCoverage/seh.ll
+++ b/test/Instrumentation/SanitizerCoverage/seh.ll
@@ -1,7 +1,6 @@
; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s
; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s
; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc18.0.0"
diff --git a/test/MC/AMDGPU/ds-err.s b/test/MC/AMDGPU/ds-err.s
index 3951efbb60f98..d9f22f5f3ed20 100644
--- a/test/MC/AMDGPU/ds-err.s
+++ b/test/MC/AMDGPU/ds-err.s
@@ -21,3 +21,93 @@ ds_write2_b32 v2, v4, v6 offset0:1000000000
// CHECK: invalid operand for instruction
ds_write2_b32 v2, v4, v6 offset1:1000000000
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+// CHECK: error: expected a colon
+ds_swizzle_b32 v8, v2 offset
+
+// CHECK: error: failed parsing operand
+ds_swizzle_b32 v8, v2 offset:
+
+// CHECK: error: expected a colon
+ds_swizzle_b32 v8, v2 offset-
+
+// CHECK: error: expected absolute expression
+ds_swizzle_b32 v8, v2 offset:SWIZZLE(QUAD_PERM, 0, 1, 2, 3)
+
+// CHECK: error: expected a swizzle mode
+ds_swizzle_b32 v8, v2 offset:swizzle(quad_perm, 0, 1, 2, 3)
+
+// CHECK: error: expected a swizzle mode
+ds_swizzle_b32 v8, v2 offset:swizzle(XXX,1)
+
+// CHECK: error: expected a comma
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM
+
+// CHECK: error: expected a comma
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2)
+
+// CHECK: error: expected a closing parentheses
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3
+
+// CHECK: error: expected a closing parentheses
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3, 4)
+
+// CHECK: error: expected a 2-bit lane id
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, -1, 1, 2, 3)
+
+// CHECK: error: expected a 2-bit lane id
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 4, 1, 2, 3)
+
+// CHECK: error: group size must be in the interval [1,16]
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,0)
+
+// CHECK: error: group size must be a power of two
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,3)
+
+// CHECK: error: group size must be in the interval [1,16]
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,17)
+
+// CHECK: error: group size must be in the interval [1,16]
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,32)
+
+// CHECK: error: group size must be in the interval [2,32]
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,1)
+
+// CHECK: error: group size must be a power of two
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,3)
+
+// CHECK: error: group size must be in the interval [2,32]
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,33)
+
+// CHECK: error: group size must be in the interval [2,32]
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,1,0)
+
+// CHECK: error: group size must be a power of two
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,3,1)
+
+// CHECK: error: group size must be in the interval [2,32]
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,33,1)
+
+// CHECK: error: lane id must be in the interval [0,group size - 1]
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,-1)
+
+// CHECK: error: lane id must be in the interval [0,group size - 1]
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,2)
+
+// CHECK: error: expected a string
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, pppii)
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "")
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "ppii")
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppiii")
+
+// CHECK: invalid mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppi2")
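
Editor's note: the group-size diagnostics above follow a simple pattern. A hypothetical checker mirroring them (range check first, then power-of-two, which matches the order the errors fire in, e.g. SWAP,32 reports the interval error even though 32 is a power of two):

/* Illustrative helper only; the real checks live in the AMDGPU AsmParser. */
static int is_pow2(unsigned n) { return n != 0 && (n & (n - 1)) == 0; }

static const char *check_group_size(unsigned n, unsigned lo, unsigned hi) {
    if (n < lo || n > hi) return "group size out of interval";  /* checked first */
    if (!is_pow2(n))      return "group size must be a power of two";
    return 0;                                                   /* valid */
}
/* SWAP uses [1,16]; REVERSE and BROADCAST use [2,32]; a BROADCAST lane id
   must additionally lie in [0, group size - 1]. */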
diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s
index 18e4957e32d75..ef36a98f746ac 100644
--- a/test/MC/AMDGPU/ds.s
+++ b/test/MC/AMDGPU/ds.s
@@ -267,10 +267,6 @@ ds_max_rtn_f32 v8, v2, v4
// SICI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x00,0x08]
// VI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x04,0x00,0x08]
-ds_swizzle_b32 v8, v2
-// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
-// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
-
ds_read_b32 v8, v2
// SICI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08]
@@ -506,3 +502,143 @@ ds_nop
// NOSI: error: instruction not supported on this GPU
// CI: ds_nop ; encoding: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
// VI: ds_nop ; encoding: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+ds_swizzle_b32 v8, v2
+// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0xFFFF
+// SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 2, 1, 3, 3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,2)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppii")
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip")
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x000
+// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x001
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x020
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x021
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x400
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x401
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x420
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x421
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
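
Editor's note: the expected encodings can be reproduced from two small formulas. The sketch below is inferred purely from the byte patterns in this test, not lifted from the assembler source:

/* Inferred layout of the 16-bit ds_swizzle offset: QUAD_PERM sets bit 15
   over four 2-bit lane selects; every other mode packs three 5-bit masks. */
static unsigned swizzle_quad_perm(unsigned l0, unsigned l1,
                                  unsigned l2, unsigned l3) {
    return 0x8000u | l0 | (l1 << 2) | (l2 << 4) | (l3 << 6);
}
static unsigned swizzle_masks(unsigned and_m, unsigned or_m, unsigned xor_m) {
    return and_m | (or_m << 5) | (xor_m << 10);
}
/* SWAP(n)          == swizzle_masks(0x1f, 0, n)       e.g. SWAP,1      -> 0x041f
   REVERSE(n)       == swizzle_masks(0x1f, 0, n - 1)   e.g. REVERSE,4   -> 0x0c1f
   BROADCAST(n, l)  == swizzle_masks(0x1f & ~(n - 1), l, 0)
                                                       e.g. BROADCAST,2,1 -> 0x003e */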
diff --git a/test/MC/ARM/big-endian-thumb-fixup.s b/test/MC/ARM/big-endian-thumb-fixup.s
index 5023fca26be10..4e81469fe489b 100644
--- a/test/MC/ARM/big-endian-thumb-fixup.s
+++ b/test/MC/ARM/big-endian-thumb-fixup.s
@@ -4,6 +4,7 @@
.text
.align 2
.code 16
+ .thumb_func
@ARM::fixup_arm_thumb_bl
.section s_thumb_bl,"ax",%progbits
diff --git a/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll
new file mode 100644
index 0000000000000..155ce5a425b45
--- /dev/null
+++ b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll
@@ -0,0 +1,77 @@
+; RUN: llc -O0 < %s -mtriple armv7-linux-gnueabi -o - \
+; RUN: | llvm-mc -triple armv7-linux-gnueabi -filetype=obj -o - \
+; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s
+
+; RUN: llc -O0 < %s -mtriple armv7-linux-android -o - \
+; RUN: | llvm-mc -triple armv7-linux-android -filetype=obj -o - \
+; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s
+
+
+; RUN: llc -O0 < %s -mtriple armv7-apple-ios -o - \
+; RUN: | llvm-mc -triple armv7-apple-ios -filetype=obj -o - \
+; RUN: | llvm-readobj -r | FileCheck --check-prefix IOS %s
+
+
+define void @thumb_caller() #0 {
+ call void @internal_arm_fn()
+ call void @global_arm_fn()
+ call void @internal_thumb_fn()
+ call void @global_thumb_fn()
+ ret void
+}
+
+define void @arm_caller() #1 {
+ call void @internal_arm_fn()
+ call void @global_arm_fn()
+ call void @internal_thumb_fn()
+ call void @global_thumb_fn()
+ ret void
+}
+
+define internal void @internal_thumb_fn() #0 {
+ ret void
+}
+
+define void @global_thumb_fn() #0 {
+entry:
+ br label %end
+end:
+ br label %end
+}
+
+define internal void @internal_arm_fn() #1 {
+ ret void
+}
+
+define void @global_arm_fn() #1 {
+entry:
+ br label %end
+end:
+ br label %end
+}
+
+attributes #0 = { "target-features"="+thumb-mode" }
+attributes #1 = { "target-features"="-thumb-mode" }
+
+; LINUX: Section (3) .rel.text {
+; LINUX-NEXT: 0x2 R_ARM_THM_CALL internal_arm_fn 0x0
+; LINUX-NEXT: 0x6 R_ARM_THM_CALL global_arm_fn 0x0
+; LINUX-NEXT: 0xE R_ARM_THM_CALL global_thumb_fn 0x0
+; LINUX-NEXT: 0x1C R_ARM_CALL internal_arm_fn 0x0
+; LINUX-NEXT: 0x20 R_ARM_CALL global_arm_fn 0x0
+; LINUX-NEXT: 0x24 R_ARM_CALL internal_thumb_fn 0x0
+; LINUX-NEXT: 0x28 R_ARM_CALL global_thumb_fn 0x0
+; LINUX-NEXT: }
+
+; IOS: Section __text {
+; IOS-NEXT: 0x2C 1 2 0 ARM_RELOC_BR24 0 __text
+; IOS-NEXT: 0x28 1 2 0 ARM_RELOC_BR24 0 __text
+; IOS-NEXT: 0x24 1 2 0 ARM_RELOC_BR24 0 __text
+; IOS-NEXT: 0x20 1 2 0 ARM_RELOC_BR24 0 __text
+; IOS-NEXT: 0x10 1 2 0 ARM_THUMB_RELOC_BR22 0 __text
+; IOS-NEXT: 0xC 1 2 0 ARM_THUMB_RELOC_BR22 0 __text
+; IOS-NEXT: 0x8 1 2 0 ARM_THUMB_RELOC_BR22 0 __text
+; IOS-NEXT: 0x4 1 2 0 ARM_THUMB_RELOC_BR22 0 __text
+; IOS-NEXT: }
diff --git a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt
index 37725e960f927..5fe7a8cd06219 100644
--- a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt
+++ b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt
@@ -990,23 +990,23 @@
# CHECK: ds_read_u16 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05]
0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05]
-0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff]
-0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff
+# CHECK: ds_swizzle_b32 v255, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff
-# CHECK: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05]
-0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v255 ; encoding: [0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05
# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:4 ; encoding: [0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
-0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05]
-0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05
# CHECK: ds_permute_b32 v5, v1, v2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05]
0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 0ec356392a2d4..c5d10a0a67e34 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -30,6 +30,8 @@
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
; CHECK-O-NEXT: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
@@ -53,7 +55,6 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
-; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
; CHECK-O-NEXT: Running analysis: GlobalsAA
; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
@@ -134,6 +135,10 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Finished CGSCC pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: EliminateAvailableExternallyPass
; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
@@ -163,6 +168,7 @@
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: ConstantMergePass
; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: PrintModulePass
;
; Make sure we get the IR back out without changes when we print the module.
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
new file mode 100644
index 0000000000000..52f475b0397d0
--- /dev/null
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -0,0 +1,221 @@
+; The IR below was crafted so as:
+; 1) To have a loop, so we create a loop pass manager
+; 2) To be "immutable" in the sense that no pass in the standard
+; pipeline will modify it.
+; Since no transformations take place, we don't expect any analyses
+; to be invalidated.
+; Any invalidation that shows up here is a bug, unless we started modifying
+; the IR, in which case we need to try harder to make it immutable.
+;
+; Prelink pipelines:
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O1>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O1
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O2>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-PRELINK-O,CHECK-PRELINK-O3
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<Os>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-PRELINK-O,CHECK-PRELINK-Os
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<Oz>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-PRELINK-O,CHECK-PRELINK-Oz
+;
+; Postlink pipelines:
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O1>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,CHECK-POSTLINK-O1
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O2>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<Os>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<Oz>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
+;
+; CHECK-O: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: PGOIndirectCallPromotion
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-O-NEXT: Running pass: SROA
+; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
+; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: IPSCCPPass
+; CHECK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PromotePass>
+; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
+; CHECK-O-NEXT: Running analysis: GlobalsAA
+; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis
+; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
+; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}>
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis
+; CHECK-O-NEXT: Starting CGSCC pass manager run.
+; CHECK-O-NEXT: Running pass: InlinerPass
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
+; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
+; CHECK-O-NEXT: Running analysis: AAManager
+; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
+; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: SROA
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
+; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O-NEXT: Running pass: JumpThreadingPass
+; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ReassociatePass
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
+; CHECK-O-NEXT: Running analysis: LoopAnalysis
+; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Starting Loop pass manager run.
+; CHECK-O-NEXT: Running pass: LoopRotatePass
+; CHECK-O-NEXT: Running pass: LICM
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
+; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
+; CHECK-O-NEXT: Starting Loop pass manager run.
+; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
+; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass
+; CHECK-O-NEXT: Running pass: LoopDeletionPass
+; CHECK-O-NEXT: Running pass: LoopUnrollPass
+; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-Os-NEXT: Running pass: GVN
+; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-Oz-NEXT: Running pass: GVN
+; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O2-NEXT: Running pass: GVN
+; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O3-NEXT: Running pass: GVN
+; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O-NEXT: Running pass: SCCPPass
+; CHECK-O-NEXT: Running pass: BDCEPass
+; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: JumpThreadingPass
+; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O-NEXT: Running pass: DSEPass
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O-NEXT: Running pass: ADCEPass
+; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Finished CGSCC pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-PRELINK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-PRELINK-O-NEXT: Running pass: NameAnonGlobalPass
+; CHECK-POSTLINK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-POSTLINK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-POSTLINK-O-NEXT: Running pass: EliminateAvailableExternallyPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
+; CHECK-POSTLINK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: Float2IntPass
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
+; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
+; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopUnrollPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass
+; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass
+; CHECK-POSTLINK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PrintModulePass
+
+; Make sure we get the IR back out without changes when we print the module.
+; CHECK-O-LABEL: define void @foo(i32 %n) local_unnamed_addr {
+; CHECK-O-NEXT: entry:
+; CHECK-O-NEXT: br label %loop
+; CHECK-O: loop:
+; CHECK-O-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-O-NEXT: %iv.next = add i32 %iv, 1
+; CHECK-O-NEXT: tail call void @bar()
+; CHECK-O-NEXT: %cmp = icmp eq i32 %iv, %n
+; CHECK-O-NEXT: br i1 %cmp, label %exit, label %loop
+; CHECK-O: exit:
+; CHECK-O-NEXT: ret void
+; CHECK-O-NEXT: }
+;
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+
+declare void @bar() local_unnamed_addr
+
+define void @foo(i32 %n) local_unnamed_addr {
+entry:
+ br label %loop
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i32 %iv, 1
+ tail call void @bar()
+ %cmp = icmp eq i32 %iv, %n
+ br i1 %cmp, label %exit, label %loop
+exit:
+ ret void
+}
diff --git a/test/ThinLTO/X86/error-newpm.ll b/test/ThinLTO/X86/newpm-basic.ll
index 9c2fd2c70d6dd..d357cbc85d005 100644
--- a/test/ThinLTO/X86/error-newpm.ll
+++ b/test/ThinLTO/X86/newpm-basic.ll
@@ -1,9 +1,7 @@
; RUN: opt -module-summary %s -o %t1.bc
-; RUN: not llvm-lto2 run %t1.bc -o %t.o \
+; RUN: llvm-lto2 run %t1.bc -o %t.o \
; RUN: -r=%t1.bc,_tinkywinky,pxl \
-; RUN: -lto-use-new-pm 2>&1 | FileCheck %s
-
-; CHECK: ThinLTO not supported with the new PM yet!
+; RUN: -lto-use-new-pm
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll
new file mode 100644
index 0000000000000..48db0b61a31be
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @callee_sinkable_bitcast(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: define{{.*}}@callee_sinkable_bitcast.{{[0-9]}}
+; CHECK: alloca
+; CHECK-NEXT: bitcast
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca %"class.base", align 4
+ %tmp1 = bitcast %"class.base"* %tmp to i8*
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2
+ %tmp11 = bitcast %"class.base"* %tmp to i32*
+ store i32 %tmp3, i32* %tmp11, align 4, !tbaa !2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ ret i32 %tmp7
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_sinkable_bitcast(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
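Editor's note: roughly what the pass does to callee_sinkable_bitcast, sketched in C under the stated CHECKs (function names here are illustrative; the pass actually names the outlined part with a numeric suffix, as the CHECK-LABEL regex shows):

/* Shape of the transformation: the cheap entry test is inlined into the
   caller, and the cold block -- with the alloca, its bitcast, and the
   lifetime markers sunk into it -- is outlined into a new function. */
void callee_cold_region(void);             /* outlined %bb5, hypothetical name */
int caller_after_partial_inlining(int arg) {
    if (arg < 0)                           /* inlined guard from %bb */
        return 0;
    callee_cold_region();                  /* call to the extracted region */
    return 1;
}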
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll
new file mode 100644
index 0000000000000..4ca418389e5ef
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_no_bitcast(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: define{{.*}}@callee_no_bitcast.{{[0-9]}}
+; CHECK: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i8, align 4
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ %tmp11 = bitcast i8* %tmp to i32*
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ ret i32 %tmp7
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_no_bitcast(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll
new file mode 100644
index 0000000000000..6bb38d44f466c
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_unknown_use1(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: define{{.*}}@callee_unknown_use1.{{[0-9]}}
+; CHECK-NOT: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i8, align 4
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ %tmp11 = bitcast i8* %tmp to i32*
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ %tmp1 = bitcast i8* %tmp to i32*
+ ret i32 %tmp7
+}
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_unknown_use1(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
new file mode 100644
index 0000000000000..9c53496e1ceac
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_unknown_use2(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: define{{.*}}@callee_unknown_use2.{{[0-9]}}
+; CHECK-NOT: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2
+ store i32 %tmp3, i32* %tmp, align 4, !tbaa !2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ call void @bar(i32* nonnull %tmp) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ %tmp10 = bitcast i8* %tmp1 to i32*
+ ret i32 %tmp7
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_unknown_use2(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
new file mode 100644
index 0000000000000..e8a4d1281a237
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 (...) @bar() #1
+ %tmp1 = icmp slt i32 %arg, 0
+ br i1 %tmp1, label %bb6, label %bb2
+
+bb2: ; preds = %bb
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ %tmp3 = tail call i32 (...) @bar() #1
+ %tmp4 = icmp eq i32 %tmp3, 10
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb2
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb2, %bb
+ %tmp7 = phi i32 [ %tmp, %bb5 ], [ 0, %bb ], [ %tmp, %bb2 ]
+ ret i32 %tmp7
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+
+declare void @foo(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+
+bb:
+ %tmp = tail call i32 @test(i32 %arg)
+ ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
new file mode 100644
index 0000000000000..a48ff4b1b8f99
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 (...) @bar() #1
+ %tmp1 = icmp slt i32 %arg, 0
+ br i1 %tmp1, label %bb6, label %bb2
+
+bb2: ; preds = %bb
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ %tmp3 = tail call i32 (...) @bar() #1
+ %tmp4 = icmp eq i32 %tmp3, 10
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb2
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb2, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ], [ 1, %bb2 ]
+ ret i32 %tmp7
+}
+
+; Function Attrs: nounwind uwtable
+declare i32 @bar(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+declare void @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+bb:
+ %tmp = tail call i32 @test(i32 %arg)
+ ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
deleted file mode 100644
index b2993657c7f53..0000000000000
--- a/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: opt < %s -gvn -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@a = common global [100 x i64] zeroinitializer, align 16
-@b = common global [100 x i64] zeroinitializer, align 16
-@g1 = common global i64 0, align 8
-@g2 = common global i64 0, align 8
-@g3 = common global i64 0, align 8
-declare i64 @goo(...) local_unnamed_addr #1
-
-define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
-entry:
- %mul = mul nsw i64 %b, %a
- store i64 %mul, i64* @g1, align 8
- %t0 = load i64, i64* @g2, align 8
- %cmp = icmp sgt i64 %t0, 3
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- %mul2 = mul nsw i64 %d, %c
- store i64 %mul2, i64* @g2, align 8
- br label %if.end
-
-; Check phi-translate works and mul is removed.
-; CHECK-LABEL: @test1(
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
- %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
- %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
- store i64 %mul3, i64* @g3, align 8
- ret void
-}
-
-define void @test2(i64 %i) {
-entry:
- %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
- %t0 = load i64, i64* %arrayidx, align 8
- %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
- %t1 = load i64, i64* %arrayidx1, align 8
- %mul = mul nsw i64 %t1, %t0
- store i64 %mul, i64* @g1, align 8
- %cmp = icmp sgt i64 %mul, 3
- br i1 %cmp, label %if.then, label %if.end
-
-; Check phi-translate works for the phi generated by loadpre. A new mul will be
-; inserted in if.then block.
-; CHECK-LABEL: @test2(
-; CHECK: if.then:
-; CHECK: %[[MUL_THEN:.*]] = mul
-; CHECK: br label %if.end
-if.then: ; preds = %entry
- %call = tail call i64 (...) @goo() #2
- store i64 %call, i64* @g2, align 8
- br label %if.end
-
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ]
- %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0
- %t2 = load i64, i64* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0
- %t3 = load i64, i64* %arrayidx4, align 8
- %mul5 = mul nsw i64 %t3, %t2
- store i64 %mul5, i64* @g3, align 8
- ret void
-}
-
-; Check phi-translate doesn't go through backedge, which may lead to incorrect
-; pre transformation.
-; CHECK: for.end:
-; CHECK-NOT: %{{.*pre-phi}} = phi
-; CHECK: ret void
-define void @test3(i64 %N, i64* nocapture readonly %a) {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
- %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ]
- %add = add nuw nsw i64 %i.0, 1
- %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add
- %tmp0 = load i64, i64* %arrayidx, align 8
- %cmp = icmp slt i64 %i.0, %N
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %call = tail call i64 (...) @goo() #2
- %add1 = sub nsw i64 0, %call
- %tobool = icmp eq i64 %tmp0, %add1
- br i1 %tobool, label %for.cond, label %for.end
-
-for.end: ; preds = %for.body, %for.cond
- %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ]
- %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa
- %tmp1 = load i64, i64* %arrayidx2, align 8
- store i64 %tmp1, i64* @g1, align 8
- ret void
-}
-
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 1b2b4d20d31da..9eec8bb6455b4 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -37,7 +37,7 @@ sw.bb2: ; preds = %if.end, %entry
%3 = load double, double* %arrayidx5, align 8
; CHECK: sw.bb2:
; CHECK-NOT: sext
-; CHECK: phi double [
+; CHECK-NEXT: phi double [
; CHECK-NOT: load
%sub6 = fsub double 3.000000e+00, %3
br label %return
diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll
index ffff2b7f08e53..685df24f62b65 100644
--- a/test/Transforms/GVN/PRE/pre-load.ll
+++ b/test/Transforms/GVN/PRE/pre-load.ll
@@ -72,7 +72,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -104,7 +104,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -263,7 +263,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
diff --git a/test/Transforms/Inline/AArch64/gep-cost.ll b/test/Transforms/Inline/AArch64/gep-cost.ll
index 204958f082dd6..7d191d37f1fc7 100644
--- a/test/Transforms/Inline/AArch64/gep-cost.ll
+++ b/test/Transforms/Inline/AArch64/gep-cost.ll
@@ -4,11 +4,21 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
-define void @outer([4 x i32]* %ptr, i32 %i) {
+define void @outer1([4 x i32]* %ptr, i32 %i) {
call void @inner1([4 x i32]* %ptr, i32 %i)
+ ret void
+}
+
+define void @outer2([4 x i32]* %ptr, i32 %i) {
call void @inner2([4 x i32]* %ptr, i32 %i)
ret void
}
+
+define void @outer3([4 x i32]* %ptr, i32 %j) {
+ call void @inner3([4 x i32]* %ptr, i32 0, i32 %j)
+ ret void
+}
+
; The gep in inner1() is reg+reg, which is a legal addressing mode for AArch64.
; Thus, both the gep and ret can be simplified.
; CHECK: Analyzing call of inner1
@@ -19,7 +29,7 @@ define void @inner1([4 x i32]* %ptr, i32 %i) {
ret void
}
-; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for
+; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for
; AArch64. Thus, only the ret can be simplified and not the gep.
; CHECK: Analyzing call of inner2
; CHECK: NumInstructionsSimplified: 1
@@ -28,3 +38,14 @@ define void @inner2([4 x i32]* %ptr, i32 %i) {
%G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 1, i32 %i
ret void
}
+
+; The gep in inner3() is reg+reg because %i is a constant known from the
+; call site, which makes it a legal addressing mode for AArch64. Thus, both
+; the gep and ret can be simplified.
+; CHECK: Analyzing call of inner3
+; CHECK: NumInstructionsSimplified: 2
+; CHECK: NumInstructions: 2
+define void @inner3([4 x i32]* %ptr, i32 %i, i32 %j) {
+ %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 %i, i32 %j
+ ret void
+}
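
To make the cost-model distinction concrete, here is a minimal sketch of the two gep shapes, assuming standard AArch64 codegen (these functions are illustrations, not part of the patch): a pure reg+reg index folds into the load's addressing mode, while an extra constant offset does not.

; Sketch: @fold can lower to a single ldr with a scaled reg+reg address,
; e.g. ldr x0, [x0, x1, lsl #3]. @no_fold must materialize the constant
; offset (+32) in a separate add before the load.
define i64 @fold(i64* %base, i64 %i) {
  %p = getelementptr inbounds i64, i64* %base, i64 %i
  %v = load i64, i64* %p
  ret i64 %v
}

define i64 @no_fold([4 x i64]* %base, i64 %i) {
  %p = getelementptr inbounds [4 x i64], [4 x i64]* %base, i64 1, i64 %i
  %v = load i64, i64* %p
  ret i64 %v
}
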
diff --git a/test/Transforms/InstCombine/ctpop.ll b/test/Transforms/InstCombine/ctpop.ll
index 6bc6f9731979b..d49a907ffce1d 100644
--- a/test/Transforms/InstCombine/ctpop.ll
+++ b/test/Transforms/InstCombine/ctpop.ll
@@ -52,3 +52,19 @@ define i1 @test4(i8 %arg) {
%res = icmp eq i8 %cnt, 2
ret i1 %res
}
+
+; Test when the maximum possible popcount isn't one less than a power of 2
+; and the compare constant is greater than it but below the next power of 2.
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @test5(i32 %arg) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[ARG:%.*]], 3
+; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.ctpop.i32(i32 [[AND]])
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %and = and i32 %arg, 3
+ %cnt = call i32 @llvm.ctpop.i32(i32 %and)
+ %res = icmp eq i32 %cnt, 3
+ ret i1 %res
+}
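
Spelled out, the known-bits argument behind the TODO (a sketch, not code from the patch): masking with 3 leaves at most two set bits, so the ctpop result lies in {0, 1, 2} and the compare against 3 can never be true. The fully folded form would be:

; Hypothetical end state the TODO aims for:
define i1 @test5_folded(i32 %arg) {
  ret i1 false
}
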
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 5654b265da586..78c98955353e5 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -305,6 +305,20 @@ define i1 @cttz_knownbits2(i32 %arg) {
ret i1 %res
}
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @cttz_knownbits3(i32 %arg) {
+; CHECK-LABEL: @cttz_knownbits3(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG:%.*]], 4
+; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) #2
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %or = or i32 %arg, 4
+ %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone
+ %res = icmp eq i32 %cnt, 3
+ ret i1 %res
+}
+
define i8 @ctlz(i8 %a) {
; CHECK-LABEL: @ctlz(
; CHECK-NEXT: ret i8 2
@@ -338,6 +352,20 @@ define i1 @ctlz_knownbits2(i8 %arg) {
ret i1 %res
}
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @ctlz_knownbits3(i8 %arg) {
+; CHECK-LABEL: @ctlz_knownbits3(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[ARG:%.*]], 32
+; CHECK-NEXT: [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) #2
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %or = or i8 %arg, 32
+ %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone
+ %res = icmp eq i8 %cnt, 3
+ ret i1 %res
+}
+
define void @cmp.simplify(i32 %a, i32 %b, i1* %c) {
%lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone
%lz.cmp = icmp eq i32 %lz, 32
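
The same known-bits reasoning applies to both new tests here (a sketch under that assumption, not code from the patch): or'ing in 4 forces bit 2 on, so cttz of the result is at most 2; or'ing an i8 with 32 forces bit 5 on, so ctlz is at most 2. Either compare against 3 is therefore statically false:

; Hypothetical folded forms the TODOs point toward:
define i1 @cttz_knownbits3_folded(i32 %arg) {
  ret i1 false ; cttz(%arg | 4) <= 2, never 3
}

define i1 @ctlz_knownbits3_folded(i8 %arg) {
  ret i1 false ; ctlz(%arg | 32) <= 2 for an i8, never 3
}
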
diff --git a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
new file mode 100644
index 0000000000000..247ea35ff5d0a
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -0,0 +1,49 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: all_scalar
+; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
+;
+define void @all_scalar(i64* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr i64, i64* %a, i64 %i
+ store i64 0, i64* %tmp0, align 1
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: PR33193
+; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
+%struct.a = type { i32, i8 }
+define void @PR33193(%struct.a* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
+ %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1
+ store i8 0, i8* %tmp0, align 4
+ %j.next = add i32 %j, 1
+ %i.next = zext i32 %j.next to i64
+ %cond = icmp ugt i64 %n, %i.next
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
new file mode 100644
index 0000000000000..736ddc32856c0
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
@@ -0,0 +1,104 @@
+; RUN: opt -lower-expect -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s
+
+define i32 @foo(i32 %arg) #0 {
+; CHECK-LABEL: @foo(i32{{.*}})
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp ne i64 %tmp1, 0
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY:![0-9]+]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo2(i32 %arg) #0 {
+; CHECK-LABEL: @foo2
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp eq i64 %tmp1, 2
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[UNLIKELY:![0-9]+]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo3(i32 %arg) #0 {
+; CHECK-LABEL: @foo3
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp eq i64 %tmp1, 4
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo4(i32 %arg) #0 {
+; CHECK-LABEL: @foo4
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp ne i64 %tmp1, 2
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo5(i32 %arg, i32 %arg1) #0 {
+; CHECK-LABEL: @foo5
+bb:
+ %tmp = sext i32 %arg1 to i64
+ %tmp2 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp3 = sext i32 %arg to i64
+ %tmp4 = icmp ne i64 %tmp2, %tmp3
+ br i1 %tmp4, label %bb5, label %bb7
+; CHECK-NOT: !prof
+
+bb5: ; preds = %bb
+ %tmp6 = call i32 (...) @bar()
+ br label %bb7
+
+bb7: ; preds = %bb5, %bb
+ ret i32 1
+}
+
+declare i64 @llvm.expect.i64(i64, i64) #1
+
+declare i32 @bar(...) local_unnamed_addr #0
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304373)"}
+; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
+
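
In summary, what these cases pin down (the sketch below is illustrative, not part of the patch): lower-expect compares the constant operand of the icmp fed by the llvm.expect result against the expected value; agreement marks the branch likely, disagreement unlikely, and a comparison against a non-constant (foo5) is left unannotated. For @foo the lowered form would look like:

; Hypothetical post-lowering IR: the intrinsic is gone and the branch
; carries the default 2000:1 weights.
define i32 @foo_lowered(i32 %arg) {
bb:
  %tmp = sext i32 %arg to i64
  %tmp2 = icmp ne i64 %tmp, 0
  br i1 %tmp2, label %bb3, label %bb5, !prof !2
bb3:
  %tmp4 = call i32 (...) @bar()
  br label %bb5
bb5:
  ret i32 1
}
!2 = !{!"branch_weights", i32 2000, i32 1}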
diff --git a/test/Transforms/NewGVN/completeness.ll b/test/Transforms/NewGVN/completeness.ll
index bafe5f966d22a..2b28f12df9d12 100644
--- a/test/Transforms/NewGVN/completeness.ll
+++ b/test/Transforms/NewGVN/completeness.ll
@@ -389,6 +389,23 @@ bb6: ; preds = %bb6, %bb2
;; Ensure that we revisit predicateinfo operands at the right points in time.
define void @test10() {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: b:
+; CHECK-NEXT: br label [[G:%.*]]
+; CHECK: g:
+; CHECK-NEXT: [[N:%.*]] = phi i32* [ [[H:%.*]], [[I:%.*]] ], [ null, [[B:%.*]] ]
+; CHECK-NEXT: [[H]] = getelementptr i32, i32* [[N]], i64 1
+; CHECK-NEXT: [[J:%.*]] = icmp eq i32* [[H]], getelementptr (i32, i32* null, i64 8)
+; CHECK-NEXT: br i1 [[J]], label [[C:%.*]], label [[I]]
+; CHECK: i:
+; CHECK-NEXT: br i1 undef, label [[K:%.*]], label [[G]]
+; CHECK: k:
+; CHECK-NEXT: br i1 false, label [[C]], label [[O:%.*]]
+; CHECK: o:
+; CHECK-NEXT: br label [[C]]
+; CHECK: c:
+; CHECK-NEXT: ret void
+;
b:
%m = getelementptr i32, i32* null, i64 8
br label %g
diff --git a/test/Transforms/NewGVN/pr33185.ll b/test/Transforms/NewGVN/pr33185.ll
new file mode 100644
index 0000000000000..c687d8fe51eba
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33185.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -newgvn -S %s | FileCheck %s
+
+@a = local_unnamed_addr global i32 9, align 4
+@.str4 = private unnamed_addr constant [6 x i8] c"D:%d\0A\00", align 1
+
+define i32 @main() local_unnamed_addr {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[CMP1_I:%.*]] = icmp ne i32 [[TMP]], 0
+; CHECK-NEXT: br label [[FOR_BODY_I:%.*]]
+; CHECK: for.body.i:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[COND_END_I:%.*]] ]
+; CHECK-NEXT: [[F_08_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[COND_END_I]] ]
+; CHECK-NEXT: [[MUL_I:%.*]] = select i1 [[CMP1_I]], i32 [[F_08_I]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[COND_END_I]], label [[COND_TRUE_I:%.*]]
+; CHECK: cond.true.i:
+; CHECK-NEXT: [[DIV_I:%.*]] = udiv i32 [[MUL_I]], [[F_08_I]]
+; CHECK-NEXT: br label [[COND_END_I]]
+; CHECK: cond.end.i:
+; CHECK-NEXT: [[COND_I:%.*]] = phi i32 [ [[DIV_I]], [[COND_TRUE_I]] ], [ 0, [[FOR_BODY_I]] ]
+; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[F_08_I]], 1
+; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i32 [[INC_I]], 4
+; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FN1_EXIT:%.*]], label [[FOR_BODY_I]]
+; CHECK: fn1.exit:
+; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 [[COND_I]])
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %tmp = load i32, i32* @a, align 4
+ %cmp1.i = icmp ne i32 %tmp, 0
+ br label %for.body.i
+
+for.body.i:
+ %tmp1 = phi i1 [ true, %entry ], [ false, %cond.end.i ]
+ %f.08.i = phi i32 [ 0, %entry ], [ %inc.i, %cond.end.i ]
+ %mul.i = select i1 %cmp1.i, i32 %f.08.i, i32 0
+ br i1 %tmp1, label %cond.end.i, label %cond.true.i
+
+cond.true.i:
+ ;; Ensure we don't replace this divide with a phi of ops that merges the wrong loop iteration value
+ %div.i = udiv i32 %mul.i, %f.08.i
+ br label %cond.end.i
+
+cond.end.i:
+ %cond.i = phi i32 [ %div.i, %cond.true.i ], [ 0, %for.body.i ]
+ %inc.i = add nuw nsw i32 %f.08.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 4
+ br i1 %exitcond.i, label %fn1.exit, label %for.body.i
+
+fn1.exit:
+ %cond.i.lcssa = phi i32 [ %cond.i, %cond.end.i ]
+ %call4= tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 %cond.i.lcssa)
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture readonly, ...)
+
diff --git a/test/Transforms/PGOProfile/branch1.ll b/test/Transforms/PGOProfile/branch1.ll
index 3db7566d50789..f675b1f1a0118 100644
--- a/test/Transforms/PGOProfile/branch1.ll
+++ b/test/Transforms/PGOProfile/branch1.ll
@@ -15,6 +15,9 @@
; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.l.profdata -S | FileCheck %s --check-prefix=USE-LARGE
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS
+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; GEN-DARWIN-LINKONCE: target triple = "x86_64-apple-darwin"
@@ -54,3 +57,5 @@ if.end:
; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3}
+
+; ANALYSIS:remark: <unknown>:0:0: sgt_i32_Zero {{.*}}66.67% (total count : 3)
diff --git a/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll
new file mode 100644
index 0000000000000..03facd072b347
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll
@@ -0,0 +1,9 @@
+; RUN: opt -passes='no-op-module' -debug-pass-manager -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s 2>&1 | FileCheck %s --check-prefix=DEBUG_PM
+; RUN: llvm-bcanalyzer -dump %t2 | FileCheck %s --check-prefix=BITCODE
+
+; DEBUG_PM: ThinLTOBitcodeWriterPass
+; BITCODE: Foo
+
+define void @Foo() {
+ ret void
+}
diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll
index 79c76baa6f619..61f59f03e1bc2 100644
--- a/test/Transforms/Util/PredicateInfo/condprop.ll
+++ b/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s
+; RUN: opt -print-predicateinfo -analyze -reverse-iterate < %s 2>&1 | FileCheck %s
@a = external global i32 ; <i32*> [#uses=7]
@@ -98,10 +99,10 @@ define void @test3(i32 %x, i32 %y) {
; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]]
; CHECK: both_zero:
@@ -382,8 +383,8 @@ ret:
define i32 @test10(i32 %j, i32 %i) {
; CHECK-LABEL: @test10(
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
-; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
+; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
; CHECK: cond_true:
; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll
index 5942ed155318c..43c508670908b 100644
--- a/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s
+; RUN: opt -print-predicateinfo -reverse-iterate < %s 2>&1 | FileCheck %s
declare void @foo(i1)
declare void @bar(i32)
@@ -10,10 +11,10 @@ define void @testor(i32 %x, i32 %y) {
; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
; CHECK: oneof:
@@ -54,10 +55,10 @@ define void @testand(i32 %x, i32 %y) {
; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
; CHECK: both:
@@ -98,9 +99,9 @@ define void @testandsame(i32 %x, i32 %y) {
; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0
; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100
; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]]
-; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]])
; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]])
+; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]])
; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]])
; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
@@ -136,23 +137,23 @@ define void @testandassume(i32 %x, i32 %y) {
; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
-; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
-; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
-; CHECK: [[TMP4:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
; CHECK-NEXT: call void @llvm.assume(i1 [[TMP5]])
-; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP1]])
+; CHECK: [[DOT0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP1]])
; CHECK: [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]])
; CHECK: [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]])
-; CHECK: [[DOT03:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP4]])
+; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP4]])
; CHECK: [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]])
; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]]
; CHECK: both:
-; CHECK-NEXT: call void @foo(i1 [[DOT0]])
; CHECK-NEXT: call void @foo(i1 [[DOT02]])
+; CHECK-NEXT: call void @foo(i1 [[DOT03]])
+; CHECK-NEXT: call void @bar(i32 [[DOT0]])
; CHECK-NEXT: call void @bar(i32 [[DOT01]])
-; CHECK-NEXT: call void @bar(i32 [[DOT03]])
; CHECK-NEXT: ret void
; CHECK: nope:
; CHECK-NEXT: call void @foo(i1 [[DOT04]])
diff --git a/test/tools/llvm-config/cflags.test b/test/tools/llvm-config/cflags.test
index ef3e486bd968a..461de86b64c0b 100644
--- a/test/tools/llvm-config/cflags.test
+++ b/test/tools/llvm-config/cflags.test
@@ -4,4 +4,4 @@ RUN: llvm-config --cxxflags 2>&1 | FileCheck %s
CHECK: -I
CHECK: {{[/\\]}}include
CHECK-NOT: error:
-CHECK-NOT: warning
+CHECK-NOT: warning:
diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.rc b/test/tools/llvm-cvtres/Inputs/test_resource.rc
index fd616520dbe1b..5ca097baa0f73 100644
--- a/test/tools/llvm-cvtres/Inputs/test_resource.rc
+++ b/test/tools/llvm-cvtres/Inputs/test_resource.rc
@@ -42,3 +42,9 @@ LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS
MENUITEM "salad", 101
MENUITEM "duck", 102
}
+
+
+myresource stringarray {
+ "this is a user defined resource\0",
+ "it contains many strings\0",
+}
\ No newline at end of file
diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.res b/test/tools/llvm-cvtres/Inputs/test_resource.res
index c577ecc3d6333..d422bb4904da4 100644
--- a/test/tools/llvm-cvtres/Inputs/test_resource.res
+++ b/test/tools/llvm-cvtres/Inputs/test_resource.res
Binary files differ
diff --git a/test/tools/llvm-cvtres/resource.test b/test/tools/llvm-cvtres/resource.test
index 16970343c60dd..b9be74bf671b2 100644
--- a/test/tools/llvm-cvtres/resource.test
+++ b/test/tools/llvm-cvtres/resource.test
@@ -4,4 +4,48 @@
RUN: llvm-cvtres %p/Inputs/test_resource.res | FileCheck %s
-CHECK: Number of resources: 7
+CHECK: Number of resources: 8
+CHECK-NEXT: Resource Tree [
+CHECK-NEXT: STRINGARRAY [
+CHECK-NEXT: MYRESOURCE [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 2 [
+CHECK-NEXT: CURSOR [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: OKAY [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 4 [
+CHECK-NEXT: "EAT" [
+CHECK-NEXT: 3081 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 14432 [
+CHECK-NEXT: 2052 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 5 [
+CHECK-NEXT: TESTDIALOG [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 9 [
+CHECK-NEXT: MYACCELERATORS [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 12 [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]