aboutsummaryrefslogtreecommitdiff
path: root/test/CodeGen
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2015-08-07 23:01:33 +0000
committerDimitry Andric <dim@FreeBSD.org>2015-08-07 23:01:33 +0000
commitee8648bdac07986a0f1ec897b02ec82a2f144d46 (patch)
tree52d1861acda1205241ee35a94aa63129c604d469 /test/CodeGen
parent1a82d4c088707c791c792f6822f611b47a12bdfe (diff)
Notes
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll60
-rw-r--r--test/CodeGen/AArch64/arm64-nvcast.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-shrink-wrapping.ll39
-rw-r--r--test/CodeGen/AArch64/nest-register.ll23
-rw-r--r--test/CodeGen/AArch64/xbfiz.ll33
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i64.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll4
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_offset_order.ll15
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_superreg.ll246
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll2
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll17
-rw-r--r--test/CodeGen/AMDGPU/ds_write2st64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.ll30
-rw-r--r--test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll35
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll4
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll25
-rw-r--r--test/CodeGen/AMDGPU/mad-sub.ll6
-rw-r--r--test/CodeGen/AMDGPU/madak.ll12
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll10
-rw-r--r--test/CodeGen/AMDGPU/mul_uint24.ll20
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll116
-rw-r--r--test/CodeGen/AMDGPU/select64.ll4
-rw-r--r--test/CodeGen/AMDGPU/shl.ll38
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.f64.ll8
-rw-r--r--test/CodeGen/AMDGPU/srl.ll35
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.f64.ll10
-rw-r--r--test/CodeGen/AMDGPU/v_mac.ll155
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll34
-rw-r--r--test/CodeGen/AMDGPU/xor.ll4
-rw-r--r--test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll3
-rw-r--r--test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll4
-rw-r--r--test/CodeGen/ARM/Windows/hard-float.ll10
-rw-r--r--test/CodeGen/ARM/Windows/long-calls.ll2
-rw-r--r--test/CodeGen/ARM/Windows/no-arm-mode.ll9
-rw-r--r--test/CodeGen/ARM/Windows/pic.ll17
-rw-r--r--test/CodeGen/ARM/Windows/structors.ll12
-rw-r--r--test/CodeGen/ARM/Windows/trivial-gnu-object.ll10
-rw-r--r--test/CodeGen/ARM/arm-returnaddr.ll6
-rw-r--r--test/CodeGen/ARM/byval-align.ll3
-rw-r--r--test/CodeGen/ARM/cttz.ll90
-rw-r--r--test/CodeGen/ARM/cttz_vector.ll383
-rw-r--r--test/CodeGen/ARM/ctz.ll11
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-static.ll4
-rw-r--r--test/CodeGen/ARM/ldrd.ll69
-rw-r--r--test/CodeGen/ARM/memset-inline.ll3
-rw-r--r--test/CodeGen/ARM/nest-register.ll21
-rw-r--r--test/CodeGen/ARM/subtarget-features-long-calls.ll49
-rw-r--r--test/CodeGen/ARM/wrong-t2stmia-size-opt.ll16
-rw-r--r--test/CodeGen/Generic/run-pass.ll7
-rw-r--r--test/CodeGen/Hexagon/Atomics.ll71
-rw-r--r--test/CodeGen/Hexagon/common-gep-basic.ll37
-rw-r--r--test/CodeGen/Hexagon/common-gep-icm.ll76
-rw-r--r--test/CodeGen/Hexagon/extract-basic.ll76
-rw-r--r--test/CodeGen/Hexagon/fusedandshift.ll3
-rw-r--r--test/CodeGen/Hexagon/insert-basic.ll66
-rw-r--r--test/CodeGen/Hexagon/predicate-logical.ll30
-rw-r--r--test/CodeGen/Hexagon/predicate-rcmp.ll19
-rw-r--r--test/CodeGen/MIR/X86/basic-block-liveins.mir25
-rw-r--r--test/CodeGen/MIR/X86/dead-register-flag.mir26
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-operand.mir38
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir38
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-livein.mir21
-rw-r--r--test/CodeGen/MIR/X86/expected-number-after-bb.mir6
-rw-r--r--test/CodeGen/MIR/X86/expected-register-after-flags.mir22
-rw-r--r--test/CodeGen/MIR/X86/expected-subregister-after-colon.mir29
-rw-r--r--test/CodeGen/MIR/X86/fixed-stack-objects.mir35
-rw-r--r--test/CodeGen/MIR/X86/global-value-operands.mir4
-rw-r--r--test/CodeGen/MIR/X86/implicit-register-flag.mir41
-rw-r--r--test/CodeGen/MIR/X86/killed-register-flag.mir42
-rw-r--r--test/CodeGen/MIR/X86/large-index-number-error.mir6
-rw-r--r--test/CodeGen/MIR/X86/machine-basic-block-operands.mir12
-rw-r--r--test/CodeGen/MIR/X86/machine-instructions.mir4
-rw-r--r--test/CodeGen/MIR/X86/missing-implicit-operand.mir40
-rw-r--r--test/CodeGen/MIR/X86/named-registers.mir2
-rw-r--r--test/CodeGen/MIR/X86/register-mask-operands.mir10
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir32
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir32
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir34
-rw-r--r--test/CodeGen/MIR/X86/stack-objects.mir39
-rw-r--r--test/CodeGen/MIR/X86/subregister-operands.mir33
-rw-r--r--test/CodeGen/MIR/X86/undef-register-flag.mir42
-rw-r--r--test/CodeGen/MIR/X86/undefined-register-class.mir26
-rw-r--r--test/CodeGen/MIR/X86/undefined-virtual-register.mir28
-rw-r--r--test/CodeGen/MIR/X86/unknown-machine-basic-block.mir6
-rw-r--r--test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir6
-rw-r--r--test/CodeGen/MIR/X86/unknown-subregister-index.mir31
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir36
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-objects.mir42
-rw-r--r--test/CodeGen/MIR/X86/virtual-registers.mir105
-rw-r--r--test/CodeGen/MIR/frame-info.mir91
-rw-r--r--test/CodeGen/MIR/llvmIR.mir2
-rw-r--r--test/CodeGen/MIR/llvmIRMissing.mir2
-rw-r--r--test/CodeGen/MIR/machine-basic-block-unknown-name.mir2
-rw-r--r--test/CodeGen/MIR/machine-function-missing-body-error.mir15
-rw-r--r--test/CodeGen/MIR/machine-function-missing-function.mir4
-rw-r--r--test/CodeGen/MIR/machine-function-missing-name.mir4
-rw-r--r--test/CodeGen/MIR/machine-function.mir8
-rw-r--r--test/CodeGen/MIR/register-info.mir4
-rw-r--r--test/CodeGen/NVPTX/loop-vectorize.ll39
-rw-r--r--test/CodeGen/NVPTX/lower-aggr-copies.ll47
-rw-r--r--test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll30
-rw-r--r--test/CodeGen/PowerPC/ppc-crbits-onoff.ll4
-rw-r--r--test/CodeGen/PowerPC/ppc32-nest.ll26
-rw-r--r--test/CodeGen/PowerPC/ppc64-anyregcc.ll16
-rw-r--r--test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll6
-rw-r--r--test/CodeGen/PowerPC/ppc64-fastcc.ll10
-rw-r--r--test/CodeGen/PowerPC/ppc64-nest.ll42
-rw-r--r--test/CodeGen/PowerPC/ppc64-patchpoint.ll20
-rw-r--r--test/CodeGen/PowerPC/ppc64-stackmap.ll14
-rw-r--r--test/CodeGen/PowerPC/recipest.ll15
-rw-r--r--test/CodeGen/PowerPC/sjlj.ll8
-rw-r--r--test/CodeGen/PowerPC/swaps-le-3.ll4
-rw-r--r--test/CodeGen/PowerPC/swaps-le-5.ll70
-rw-r--r--test/CodeGen/PowerPC/tls-store2.ll4
-rw-r--r--test/CodeGen/PowerPC/vsx-elementary-arith.ll31
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-m.ll30
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-sp.ll13
-rw-r--r--test/CodeGen/SPARC/basictest.ll2
-rw-r--r--test/CodeGen/SPARC/multiple-div.ll21
-rw-r--r--test/CodeGen/Thumb2/aapcs.ll6
-rw-r--r--test/CodeGen/WebAssembly/lit.local.cfg2
-rw-r--r--test/CodeGen/WinEH/cppeh-alloca-sink.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-all-win32.ll86
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-and-throw.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-scalar.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-unwind.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-frame-vars.ll12
-rw-r--r--test/CodeGen/WinEH/cppeh-inalloca.ll12
-rw-r--r--test/CodeGen/WinEH/cppeh-min-unwind.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-multi-catch.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-1.ll6
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-2.ll10
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-3.ll10
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-rethrow.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll16
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll9
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-catch.ll130
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-cleanups.ll14
-rw-r--r--test/CodeGen/WinEH/cppeh-shared-empty-catch.ll2
-rw-r--r--test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll2
-rw-r--r--test/CodeGen/WinEH/cppeh-state-calc-1.ll10
-rw-r--r--test/CodeGen/WinEH/seh-exception-code.ll66
-rw-r--r--test/CodeGen/WinEH/seh-exception-code2.ll91
-rw-r--r--test/CodeGen/WinEH/seh-inlined-finally.ll18
-rw-r--r--test/CodeGen/WinEH/seh-outlined-finally-win32.ll172
-rw-r--r--test/CodeGen/WinEH/seh-outlined-finally.ll10
-rw-r--r--test/CodeGen/WinEH/seh-prepared-basic.ll6
-rw-r--r--test/CodeGen/WinEH/seh-simple.ll34
-rw-r--r--test/CodeGen/X86/avx-vperm2x128.ll20
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll50
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll41
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll80
-rw-r--r--test/CodeGen/X86/cppeh-nounwind.ll35
-rw-r--r--test/CodeGen/X86/eh-nolandingpads.ll12
-rw-r--r--test/CodeGen/X86/fdiv-combine.ll21
-rw-r--r--test/CodeGen/X86/frameescape.ll12
-rw-r--r--test/CodeGen/X86/frameregister.ll30
-rw-r--r--test/CodeGen/X86/implicit-null-check-negative.ll42
-rw-r--r--test/CodeGen/X86/implicit-null-check.ll44
-rw-r--r--test/CodeGen/X86/inline-asm-bad-constraint-n.ll10
-rw-r--r--test/CodeGen/X86/legalize-shl-vec.ll44
-rw-r--r--test/CodeGen/X86/machine-combiner.ll197
-rw-r--r--test/CodeGen/X86/pr13577.ll17
-rw-r--r--test/CodeGen/X86/read-fp-no-frame-pointer.ll12
-rw-r--r--test/CodeGen/X86/seh-catch-all-win32.ll26
-rw-r--r--test/CodeGen/X86/seh-except-finally.ll6
-rw-r--r--test/CodeGen/X86/seh-stack-realign-win32.ll99
-rw-r--r--test/CodeGen/X86/seh-stack-realign.ll101
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll8
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll2
-rw-r--r--test/CodeGen/X86/sse3.ll2
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx1.ll18
-rw-r--r--test/CodeGen/X86/stack-folding-fp-sse42.ll38
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll9
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll145
-rw-r--r--test/CodeGen/X86/vector-gep.ll22
-rw-r--r--test/CodeGen/X86/vector-sext.ll28
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll1284
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll142
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll791
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll137
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll550
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll4
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse4a.ll221
-rw-r--r--test/CodeGen/X86/vector-trunc.ll16
-rw-r--r--test/CodeGen/X86/vector-zext.ll80
-rw-r--r--test/CodeGen/X86/vector-zmov.ll34
-rw-r--r--test/CodeGen/X86/visibility.ll6
-rw-r--r--test/CodeGen/X86/vshift-3.ll5
-rw-r--r--test/CodeGen/X86/webkit-jscc.ll18
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll7
-rw-r--r--test/CodeGen/X86/widen_load-2.ll14
-rw-r--r--test/CodeGen/X86/win32-eh.ll44
-rw-r--r--test/CodeGen/X86/win64_frame.ll5
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll39
200 files changed, 6634 insertions, 2328 deletions
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index a31c66bad4be..739570236da9 100644
--- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -255,7 +255,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -302,7 +302,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -364,7 +364,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -417,7 +417,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -468,7 +468,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -482,6 +482,56 @@ entry:
; CHECK: ldp x20, x19, [sp], #32
; CHECK: ret
+
+define void @realign_conditional(i1 %b) {
+entry:
+ br i1 %b, label %bb0, label %bb1
+
+bb0:
+ %MyAlloca = alloca i8, i64 64, align 32
+ br label %bb1
+
+bb1:
+ ret void
+}
+
+; CHECK-LABEL: realign_conditional
+; No realignment in the prologue.
+; CHECK-NOT: and
+; CHECK-NOT: 0xffffffffffffffe0
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
+; Stack is realigned in a non-entry BB.
+; CHECK: sub [[REG:x[01-9]+]], sp, #64
+; CHECK: and sp, [[REG]], #0xffffffffffffffe0
+; CHECK: .[[LABEL]]:
+; CHECK: ret
+
+
+define void @realign_conditional2(i1 %b) {
+entry:
+ %tmp = alloca i8, i32 4
+ br i1 %b, label %bb0, label %bb1
+
+bb0:
+ %MyAlloca = alloca i8, i64 64, align 32
+ br label %bb1
+
+bb1:
+ ret void
+}
+
+; CHECK-LABEL: realign_conditional2
+; Extra realignment in the prologue (performance issue).
+; CHECK: sub x9, sp, #32 // =32
+; CHECK: and sp, x9, #0xffffffffffffffe0
+; CHECK: mov x19, sp
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
+; Stack is realigned in a non-entry BB.
+; CHECK: sub [[REG:x[01-9]+]], sp, #64
+; CHECK: and sp, [[REG]], #0xffffffffffffffe0
+; CHECK: .[[LABEL]]:
+; CHECK: ret
+
attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll
new file mode 100644
index 000000000000..3cb1bf25fc34
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+; CHECK-LABEL: _test:
+; CHECK: fmov.2d v0, #2.00000000
+; CHECK: str q0, [sp]
+; CHECK: mov x8, sp
+; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK: str s0, [x0]
+
+define void @test(float * %p1, i32 %v1) {
+entry:
+ %v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
+ store float %v2, float* %p1, align 4
+ ret void
+}
+
+; CHECK-LABEL: _test2
+; CHECK: movi.16b v0, #0x3f
+; CHECK: str q0, [sp]
+; CHECK: mov x8, sp
+; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK: str s0, [x0]
+
+define void @test2(float * %p1, i32 %v1) {
+entry:
+ %v2 = extractelement <3 x float> <float 0.7470588088035583, float 0.7470588088035583, float 0.7470588088035583>, i32 %v1
+ store float %v2, float* %p1, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index c1777513fa04..599712be401c 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -500,3 +500,42 @@ if.end: ; preds = %if.else, %if.then
}
declare i32 @someVariadicFunc(i32, ...)
+
+; Make sure we do not insert unreachable code after noreturn function.
+; Although this is not incorrect to insert such code, it is useless
+; and it hurts the binary size.
+;
+; CHECK-LABEL: noreturn:
+; DISABLE: stp
+;
+; CHECK: and [[TEST:w[0-9]+]], w0, #0xff
+; CHECK-NEXT: cbnz [[TEST]], [[ABORT:LBB[0-9_]+]]
+;
+; CHECK: movz w0, #0x2a
+;
+; DISABLE-NEXT: ldp
+;
+; CHECK-NEXT: ret
+;
+; CHECK: [[ABORT]]: ; %if.abort
+;
+; ENABLE: stp
+;
+; CHECK: bl _abort
+; ENABLE-NOT: ldp
+define i32 @noreturn(i8 signext %bad_thing) {
+entry:
+ %tobool = icmp eq i8 %bad_thing, 0
+ br i1 %tobool, label %if.end, label %if.abort
+
+if.abort:
+ tail call void @abort() #0
+ unreachable
+
+if.end:
+ ret i32 42
+}
+
+declare void @abort() #0
+
+attributes #0 = { noreturn nounwind }
diff --git a/test/CodeGen/AArch64/nest-register.ll b/test/CodeGen/AArch64/nest-register.ll
new file mode 100644
index 000000000000..9c659fb74ec4
--- /dev/null
+++ b/test/CodeGen/AArch64/nest-register.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register.
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: mov x0, x18
+; CHECK-NEXT: ret
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mov x18, x0
+; CHECK-NEXT: bl nest_receiver
+; CHECK: ret
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
diff --git a/test/CodeGen/AArch64/xbfiz.ll b/test/CodeGen/AArch64/xbfiz.ll
new file mode 100644
index 000000000000..f763400d7f6a
--- /dev/null
+++ b/test/CodeGen/AArch64/xbfiz.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+
+define i64 @sbfiz64(i64 %v) {
+; CHECK-LABEL: sbfiz64:
+; CHECK: sbfiz x0, x0, #1, #16
+ %shl = shl i64 %v, 48
+ %shr = ashr i64 %shl, 47
+ ret i64 %shr
+}
+
+define i32 @sbfiz32(i32 %v) {
+; CHECK-LABEL: sbfiz32:
+; CHECK: sbfiz w0, w0, #1, #14
+ %shl = shl i32 %v, 18
+ %shr = ashr i32 %shl, 17
+ ret i32 %shr
+}
+
+define i64 @ubfiz64(i64 %v) {
+; CHECK-LABEL: ubfiz64:
+; CHECK: ubfiz x0, x0, #36, #11
+ %shl = shl i64 %v, 53
+ %shr = lshr i64 %shl, 17
+ ret i64 %shr
+}
+
+define i32 @ubfiz32(i32 %v) {
+; CHECK-LABEL: ubfiz32:
+; CHECK: ubfiz w0, w0, #6, #24
+ %shl = shl i32 %v, 8
+ %shr = lshr i32 %shl, 2
+ ret i32 %shr
+}
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index eae095eb8449..a3ae3c3aea16 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -3,8 +3,9 @@
declare i32 @llvm.SI.tid() readnone
; SI-LABEL: {{^}}test_array_ptr_calc:
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.SI.tid() readnone
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index e7e13d6178c4..5e4654abd91b 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
declare i32 @llvm.r600.read.tidig.x() #0
declare void @llvm.AMDGPU.barrier.local() #1
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 5929898f8bd8..ec04f8b1acd6 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.
@lds = addrspace(3) global [512 x float] undef, align 4
- @lds.f64 = addrspace(3) global [512 x double] undef, align 8
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
; SI-LABEL: @simple_read2_f32
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 9ea9a5a2617b..d362c46bbf96 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -1,16 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
-; XFAIL: *
@lds = addrspace(3) global [512 x float] undef, align 4
+; offset0 is larger than offset1
+
; SI-LABEL: {{^}}offset_order:
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
-; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1
+; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
new file mode 100644
index 000000000000..842c2d8bc339
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -0,0 +1,246 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.v2 = addrspace(3) global [512 x <2 x float>] undef, align 4
+@lds.v3 = addrspace(3) global [512 x <3 x float>] undef, align 4
+@lds.v4 = addrspace(3) global [512 x <4 x float>] undef, align 4
+@lds.v8 = addrspace(3) global [512 x <8 x float>] undef, align 4
+@lds.v16 = addrspace(3) global [512 x <16 x float>] undef, align 4
+
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg_align4:
+; CI: ds_read2_b32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dwordx2 [[RESULT]]
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
+ %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg:
+; CI: ds_read_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dwordx2 [[RESULT]]
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
+ %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Shuffling to new superregister
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
+; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
+; CI: buffer_store_dword v[[ADD2]]
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
+ %elt0 = extractelement <4 x float> %val0, i32 0
+ %elt1 = extractelement <4 x float> %val0, i32 1
+ %elt2 = extractelement <4 x float> %val0, i32 2
+ %elt3 = extractelement <4 x float> %val0, i32 3
+
+ %add0 = fadd float %elt0, %elt2
+ %add1 = fadd float %elt1, %elt3
+ %add2 = fadd float %add0, %add1
+
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %add2, float addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI: buffer_store_dword v[[ADD1]]
+; CI: s_endpgm
+define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
+ %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
+ %elt0 = extractelement <3 x float> %val0, i32 0
+ %elt1 = extractelement <3 x float> %val0, i32 1
+ %elt2 = extractelement <3 x float> %val0, i32 2
+
+ %add0 = fadd float %elt0, %elt2
+ %add1 = fadd float %add0, %elt1
+
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %add1, float addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dwordx4
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dwordx4
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
+ %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i
+ store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
+ %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i
+ store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Do scalar loads into the super register we need.
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-NOT: v_mov
+; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
+
+ %val0 = load float, float addrspace(3)* %arrayidx0
+ %val1 = load float, float addrspace(3)* %arrayidx1
+
+ %vec.0 = insertelement <2 x float> undef, float %val0, i32 0
+ %vec.1 = insertelement <2 x float> %vec.0, float %val1, i32 1
+
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %vec.1, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Do scalar loads into the super register we need.
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-NOT: v_mov
+; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
+ %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2
+ %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 3
+
+ %val0 = load float, float addrspace(3)* %arrayidx0
+ %val1 = load float, float addrspace(3)* %arrayidx1
+ %val2 = load float, float addrspace(3)* %arrayidx2
+ %val3 = load float, float addrspace(3)* %arrayidx3
+
+ %vec.0 = insertelement <4 x float> undef, float %val0, i32 0
+ %vec.1 = insertelement <4 x float> %vec.0, float %val1, i32 1
+ %vec.2 = insertelement <4 x float> %vec.1, float %val2, i32 2
+ %vec.3 = insertelement <4 x float> %vec.2, float %val3, i32 3
+
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %vec.3, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 54b3b45636d6..e2e441214b4a 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index b553d3459e40..d4973e377b59 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -25,7 +25,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -405,6 +405,19 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
ret void
}
+; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
+; CI: s_endpgm
+define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
+ %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
+ ret void
+}
+
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 1d9d881c5c7e..358aa6a9e363 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -1,9 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
-
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
-
; SI-LABEL: @simple_write2st64_one_val_f32_0_1
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll
index ae84d841021d..600f0cb83578 100644
--- a/test/CodeGen/AMDGPU/fmuladd.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; CHECK-LABEL: {{^}}fmuladd_f32:
-; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; CHECK: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) {
@@ -34,8 +34,8 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -53,8 +53,8 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -72,8 +72,8 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fadd_a_a_b_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
@@ -94,8 +94,8 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fadd_b_a_a_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
@@ -116,8 +116,8 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -136,8 +136,8 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -158,8 +158,8 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
new file mode 100644
index 000000000000..2a01a621fc42
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GatherAllAliases gives up on trying to analyze cases where the
+; pointer may have been loaded from an aliased store, so make sure
+; that this works and allows moving the stores to a better chain to
+; allow them to be merged merged when it's clear the pointer is loaded
+; from constant/invariant memory.
+
+; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load:
+; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN: buffer_store_dword [[K]], [[PTR]]
+define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
+ %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
+ %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
+ store i16 123, i16 addrspace(1)* %ptr, align 4
+ store i16 456, i16 addrspace(1)* %ptr.1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load:
+; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN: buffer_store_dword [[K]], s{{\[}}[[SPTR_LO]]:
+define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
+ %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
+ %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
+ store i16 123, i16 addrspace(1)* %ptr, align 4
+ store i16 456, i16 addrspace(1)* %ptr.1
+ ret void
+}
+
+!0 = !{}
+
+attributes #0 = { nounwind } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
index 4e4c2ec7791a..a64dd0ebd2dd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
@@ -5,7 +5,7 @@ declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}test_lrp:
; SI: v_sub_f32
-; SI: v_mad_f32
+; SI: v_mac_f32_e32
define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
%mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
store float %mad, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index f5f124d915a5..d0e49243ffa7 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -9,8 +9,8 @@
; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
-; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]|
-; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]|
+; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
; SI: buffer_store_dword [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index bc071628ead0..c98f851f2b93 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -19,7 +19,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
@@ -29,7 +29,8 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
-; SI: buffer_store_dword [[RESULT]]
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -54,8 +55,8 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
-; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
@@ -64,8 +65,10 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
-; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
@@ -96,13 +99,14 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
-; SI: buffer_store_dword [[RESULT]]
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -482,7 +486,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
@@ -492,7 +496,8 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
-; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll
index aa4194ff6106..24ff23a4cfc1 100644
--- a/test/CodeGen/AMDGPU/mad-sub.ll
+++ b/test/CodeGen/AMDGPU/mad-sub.ll
@@ -123,7 +123,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
}
; FUNC-LABEL: {{^}}neg_neg_mad_f32:
-; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%tid.ext = sext i32 %tid to i64
@@ -172,8 +172,8 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; SI: buffer_store_dword [[RESULT]]
+; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; SI: buffer_store_dword [[R2]]
define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 933bb016d2c9..2e90cf10a3b5 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madak_f32:
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: buffer_load_dword [[VB:v[0-9]+]]
-; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -34,8 +34,8 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
; GCN: s_endpgm
define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -105,7 +105,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
+; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -124,7 +124,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
+; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -140,7 +140,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index ba7bb221a99a..f8e14e34af67 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -28,8 +28,8 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]]
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
; GCN: s_endpgm
define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -59,7 +59,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-LABEL: {{^}}madmk_inline_imm_f32:
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]]
+; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -77,7 +77,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-LABEL: {{^}}s_s_madmk_f32:
; GCN-NOT: v_madmk_f32
-; GCN: v_mad_f32
+; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -107,7 +107,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-LABEL: {{^}}scalar_vector_madmk_f32:
; GCN-NOT: v_madmk_f32
-; GCN: v_mad_f32
+; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll
index e640a7cd69f6..8a0e71d739be 100644
--- a/test/CodeGen/AMDGPU/mul_uint24.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24.ll
@@ -52,16 +52,18 @@ entry:
; FUNC_LABEL: {{^}}mul24_i64:
; EG; MUL_UINT24
; EG: MULHI
-; SI: v_mul_u32_u24
; FIXME: SI support 24-bit mulhi
-; SI: v_mul_hi_u32
-define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+
+; SI-DAG: v_mul_u32_u24
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
entry:
- %0 = shl i64 %a, 40
- %a_24 = lshr i64 %0, 40
- %1 = shl i64 %b, 40
- %b_24 = lshr i64 %1, 40
- %2 = mul i64 %a_24, %b_24
- store i64 %2, i64 addrspace(1)* %out
+ %tmp0 = shl i64 %a, 40
+ %a_24 = lshr i64 %tmp0, 40
+ %tmp1 = shl i64 %b, 40
+ %b_24 = lshr i64 %tmp1, 40
+ %tmp2 = mul i64 %a_24, %b_24
+ store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 59082c65cc8a..94758ad84c18 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -6,10 +6,10 @@
; FUNC-LABEL: {{^}}select_v4i8:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
%cmp = icmp eq i8 %c, 0
%select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
@@ -18,10 +18,10 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b,
}
; FUNC-LABEL: {{^}}select_v4i16:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
@@ -30,8 +30,8 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
}
; FUNC-LABEL: {{^}}select_v2i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
@@ -41,10 +41,10 @@ define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32>
}
; FUNC-LABEL: {{^}}select_v4i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
@@ -54,14 +54,14 @@ define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32>
}
; FUNC-LABEL: {{^}}select_v8i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
@@ -88,14 +88,14 @@ define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f
}
; FUNC-LABEL: {{^}}select_v8f32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x float> %a, <8 x float> %b
@@ -104,10 +104,10 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f
}
; FUNC-LABEL: {{^}}select_v2f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x double> %a, <2 x double> %b
@@ -116,14 +116,14 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x
}
; FUNC-LABEL: {{^}}select_v4f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x double> %a, <4 x double> %b
@@ -132,22 +132,22 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x
}
; FUNC-LABEL: {{^}}select_v8f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x double> %a, <8 x double> %b
diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll
index 5cebb30dc72e..13fb575b2b15 100644
--- a/test/CodeGen/AMDGPU/select64.ll
+++ b/test/CodeGen/AMDGPU/select64.ll
@@ -55,8 +55,8 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa
; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
-; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
-; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
; CHECK: s_endpgm
define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%cmp = icmp ugt i32 %cond, 5
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 53b63dc4b8ad..6f81a39ed96a 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
;EG: {{^}}shl_v2i32:
;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -178,3 +181,32 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_shl_32_i64:
+; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = shl i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_32_i64:
+; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
+define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = shl i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index da4e91db3a38..0db7cdc171b5 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -12,11 +12,11 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
; FIXME: select on 0, 0
; SI-LABEL: {{^}}sint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cmp_eq_i32_e64 vcc,
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; uses an SGPR (implicit vcc).
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index 4904d7fa1bd0..0dad91e709d9 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,7 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+declare i32 @llvm.r600.read.tidig.x() #0
+
; FUNC-LABEL: {{^}}lshr_i32:
; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -184,3 +186,32 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_lshr_32_i64:
+; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
+; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = lshr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_32_i64:
+; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
+define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = lshr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index dfec8eb15cb7..6f608df5e9f5 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -72,11 +72,11 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
; FIXME: select on 0, 0
; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; SI: v_cmp_eq_i32_e64 vcc
+; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
+; uses an SGPR (implicit vcc).
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
new file mode 100644
index 000000000000..a4eaec3403c9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -0,0 +1,155 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}mac_vvv:
+; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4
+; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN: buffer_store_dword [[C]]
+define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5
+define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) {
+entry:
+ %tmp0 = fmul float 0.5, %in
+ %tmp1 = fadd float %tmp0, 0.5
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_vvs:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mac_ssv:
+; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
+entry:
+ %c = load float, float addrspace(1)* %in
+
+ %tmp0 = fmul float %a, %a
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mac_mad_same_add:
+; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
+; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+ %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
+ %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+ %d = load float, float addrspace(1)* %d_ptr
+ %e = load float, float addrspace(1)* %e_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ %tmp2 = fmul float %d, %e
+ %tmp3 = fadd float %tmp2, %c
+
+ %out1 = getelementptr float, float addrspace(1)* %out, i32 1
+ store float %tmp1, float addrspace(1)* %out
+ store float %tmp3, float addrspace(1)* %out1
+ ret void
+}
+
+; There is no advantage to using v_mac when one of the operands is negated
+; and v_mad accepts more operand types.
+
+; GCN-LABEL: {{^}}mad_neg_src0:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_a = fsub float 0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_neg_src1:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_b = fsub float 0.0, %b
+ %tmp0 = fmul float %a, %neg_b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_neg_src2:
+; GCN-NOT: v_mac
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_c = fsub float 0.0, %c
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %neg_c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { "true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index a3014b03d2b3..dc1f1ea11b01 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -1,14 +1,14 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_select_v2i32:
-;EG: {{^}}test_select_v2i32:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v2i32:
-;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e32
define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
entry:
@@ -20,13 +20,13 @@ entry:
ret void
}
-;EG: {{^}}test_select_v2f32:
+;FUNC-LABEL: {{^}}test_select_v2f32:
+
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v2f32:
-;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e32
define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
entry:
@@ -38,17 +38,19 @@ entry:
ret void
}
-;EG: {{^}}test_select_v4i32:
+;FUNC-LABEL: {{^}}test_select_v4i32:
+
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v4i32:
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
+; FIXME: The shrinking does not happen on tonga
+
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
entry:
@@ -60,7 +62,7 @@ entry:
ret void
}
-;EG: {{^}}test_select_v4f32:
+;FUNC-LABEL: {{^}}test_select_v4f32:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 089db59eabc7..ddb920af29d8 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -42,8 +42,8 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
-; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
+; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]]
+; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
diff --git a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
index c93d2a2d34fb..ac5b6f9c9708 100644
--- a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
+++ b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
@@ -25,8 +25,7 @@ entry:
;CHECK: push {r7, lr}
;CHECK: sub sp, #4
;CHECK: add r0, sp, #12
- ;CHECK: str r2, [sp, #16]
- ;CHECK: str r1, [sp, #12]
+ ;CHECK: strd r1, r2, [sp, #12]
;CHECK: bl fooUseStruct
call void @fooUseStruct(%st_t* %p1)
ret void
diff --git a/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll b/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
index 438b021a040b..d3aa2331d45f 100644
--- a/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
+++ b/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
@@ -9,8 +9,8 @@ define void @foo(%struct4bytes* byval %p0, ; --> R0
) {
;CHECK: sub sp, sp, #16
;CHECK: push {r11, lr}
-;CHECK: add r11, sp, #8
-;CHECK: stm r11, {r0, r1, r2, r3}
+;CHECK: add r12, sp, #8
+;CHECK: stm r12, {r0, r1, r2, r3}
;CHECK: add r0, sp, #12
;CHECK: bl useInt
;CHECK: pop {r11, lr}
diff --git a/test/CodeGen/ARM/Windows/hard-float.ll b/test/CodeGen/ARM/Windows/hard-float.ll
index f7b7ec273ce8..1ce02813dfc2 100644
--- a/test/CodeGen/ARM/Windows/hard-float.ll
+++ b/test/CodeGen/ARM/Windows/hard-float.ll
@@ -1,4 +1,8 @@
-; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple=thumbv7-windows-gnu -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
define float @function(float %f, float %g) nounwind {
entry:
@@ -6,5 +10,7 @@ entry:
ret float %h
}
-; CHECK: vadd.f32 s0, s0, s1
+; CHECK-WIN: vadd.f32 s0, s0, s1
+
+; CHECK-GNU: vadd.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/Windows/long-calls.ll b/test/CodeGen/ARM/Windows/long-calls.ll
index 21c95fac91c5..4e5bdce146f0 100644
--- a/test/CodeGen/ARM/Windows/long-calls.ll
+++ b/test/CodeGen/ARM/Windows/long-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -arm-long-calls -o - %s \
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -mattr=+long-calls -o - %s \
; RUN: | FileCheck %s
declare arm_aapcs_vfpcc void @callee()
diff --git a/test/CodeGen/ARM/Windows/no-arm-mode.ll b/test/CodeGen/ARM/Windows/no-arm-mode.ll
index 6db031fc9169..30353640a4cc 100644
--- a/test/CodeGen/ARM/Windows/no-arm-mode.ll
+++ b/test/CodeGen/ARM/Windows/no-arm-mode.ll
@@ -1,5 +1,10 @@
; RUN: not llc -mtriple=armv7-windows-itanium -mcpu=cortex-a9 -o /dev/null %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
-; CHECK: does not support ARM mode execution
+; RUN: not llc -mtriple=armv7-windows-gnu -mcpu=cortex-a9 -o /dev/null %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
+
+; CHECK-WIN: does not support ARM mode execution
+
+; CHECK-GNU: does not support ARM mode execution
diff --git a/test/CodeGen/ARM/Windows/pic.ll b/test/CodeGen/ARM/Windows/pic.ll
index 9ef7c35c5530..df4c400035a3 100644
--- a/test/CodeGen/ARM/Windows/pic.ll
+++ b/test/CodeGen/ARM/Windows/pic.ll
@@ -1,5 +1,8 @@
; RUN: llc -mtriple thumbv7-windows-itanium -relocation-model pic -filetype asm -o - %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple thumbv7-windows-gnu -relocation-model pic -filetype asm -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
@external = external global i8
@@ -9,8 +12,12 @@ entry:
ret i8 %0
}
-; CHECK-LABEL: return_external
-; CHECK: movw r0, :lower16:external
-; CHECK: movt r0, :upper16:external
-; CHECK: ldrb r0, [r0]
+; CHECK-WIN-LABEL: return_external
+; CHECK-WIN: movw r0, :lower16:external
+; CHECK-WIN: movt r0, :upper16:external
+; CHECK-WIN: ldrb r0, [r0]
+; CHECK-GNU-LABEL: return_external
+; CHECK-GNU: movw r0, :lower16:external
+; CHECK-GNU: movt r0, :upper16:external
+; CHECK-GNU: ldrb r0, [r0]
diff --git a/test/CodeGen/ARM/Windows/structors.ll b/test/CodeGen/ARM/Windows/structors.ll
index 874b5bf35b81..eff1c7f4b384 100644
--- a/test/CodeGen/ARM/Windows/structors.ll
+++ b/test/CodeGen/ARM/Windows/structors.ll
@@ -1,4 +1,8 @@
-; RUN: llc -mtriple thumbv7-windows-itanium -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv7-windows-itanium -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple thumbv7-windows-gnu -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @function, i8* null }]
@@ -7,6 +11,8 @@ entry:
ret void
}
-; CHECK: .section .CRT$XCU,"dr"
-; CHECK: .long function
+; CHECK-WIN: .section .CRT$XCU,"dr"
+; CHECK-WIN: .long function
+; CHECK-GNU: .section .ctors,"dw"
+; CHECK-GNU: .long function
diff --git a/test/CodeGen/ARM/Windows/trivial-gnu-object.ll b/test/CodeGen/ARM/Windows/trivial-gnu-object.ll
new file mode 100644
index 000000000000..a242f39601cb
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/trivial-gnu-object.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-windows-gnu -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
+
+define void @foo() {
+; CHECK: file format COFF-ARM
+
+; CHECK-LABEL: foo:
+; CHECK: bx lr
+ ret void
+}
diff --git a/test/CodeGen/ARM/arm-returnaddr.ll b/test/CodeGen/ARM/arm-returnaddr.ll
index 4266572b077f..26f8c67bb15a 100644
--- a/test/CodeGen/ARM/arm-returnaddr.ll
+++ b/test/CodeGen/ARM/arm-returnaddr.ll
@@ -8,7 +8,6 @@
define i8* @rt0(i32 %x) nounwind readnone {
entry:
; CHECK-LABEL: rt0:
-; CHECK: {r7, lr}
; CHECK: mov r0, lr
%0 = tail call i8* @llvm.returnaddress(i32 0)
ret i8* %0
@@ -17,10 +16,9 @@ entry:
define i8* @rt2() nounwind readnone {
entry:
; CHECK-LABEL: rt2:
-; CHECK: {r7, lr}
; CHECK: ldr r[[R0:[0-9]+]], [r7]
-; CHECK: ldr r0, [r0]
-; CHECK: ldr r0, [r0, #4]
+; CHECK: ldr r0, [r[[R0]]]
+; CHECK: ldr r0, [r[[R0]], #4]
%0 = tail call i8* @llvm.returnaddress(i32 2)
ret i8* %0
}
diff --git a/test/CodeGen/ARM/byval-align.ll b/test/CodeGen/ARM/byval-align.ll
index a26b5a795756..8a506280dd57 100644
--- a/test/CodeGen/ARM/byval-align.ll
+++ b/test/CodeGen/ARM/byval-align.ll
@@ -28,8 +28,7 @@ define i32 @test_align8(i8*, [4 x i32]* byval align 8 %b) {
; CHECK: push {r4, r7, lr}
; CHECK: add r7, sp, #4
-; CHECK-DAG: str r2, [r7, #8]
-; CHECK-DAG: str r3, [r7, #12]
+; CHECK: strd r2, r3, [r7, #8]
; CHECK: ldr r0, [r7, #8]
diff --git a/test/CodeGen/ARM/cttz.ll b/test/CodeGen/ARM/cttz.ll
new file mode 100644
index 000000000000..dacfca505931
--- /dev/null
+++ b/test/CodeGen/ARM/cttz.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s
+
+; This test checks the @llvm.cttz.* intrinsics for integers.
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+;------------------------------------------------------------------------------
+
+define i8 @test_i8(i8 %a) {
+; CHECK-LABEL: test_i8:
+; CHECK: orr [[REG:r[0-9]+]], [[REG]], #256
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+ ret i8 %tmp
+}
+
+define i16 @test_i16(i16 %a) {
+; CHECK-LABEL: test_i16:
+; CHECK: orr [[REG:r[0-9]+]], [[REG]], #65536
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+ ret i16 %tmp
+}
+
+define i32 @test_i32(i32 %a) {
+; CHECK-LABEL: test_i32:
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %tmp
+}
+
+define i64 @test_i64(i64 %a) {
+; CHECK-LABEL: test_i64:
+; CHECK: rbit
+; CHECK: rbit
+; CHECK: cmp
+; CHECK: clz
+; CHECK: add
+; CHECK: clzne
+ %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %tmp
+}
+
+;------------------------------------------------------------------------------
+
+define i8 @test_i8_zero_undef(i8 %a) {
+; CHECK-LABEL: test_i8_zero_undef:
+; CHECK-NOT: orr
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+ ret i8 %tmp
+}
+
+define i16 @test_i16_zero_undef(i16 %a) {
+; CHECK-LABEL: test_i16_zero_undef:
+; CHECK-NOT: orr
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+ ret i16 %tmp
+}
+
+
+define i32 @test_i32_zero_undef(i32 %a) {
+; CHECK-LABEL: test_i32_zero_undef:
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %tmp
+}
+
+define i64 @test_i64_zero_undef(i64 %a) {
+; CHECK-LABEL: test_i64_zero_undef:
+; CHECK: rbit
+; CHECK: rbit
+; CHECK: cmp
+; CHECK: clz
+; CHECK: add
+; CHECK: clzne
+ %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 true)
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/ARM/cttz_vector.ll b/test/CodeGen/ARM/cttz_vector.ll
new file mode 100644
index 000000000000..9480d75db47a
--- /dev/null
+++ b/test/CodeGen/ARM/cttz_vector.ll
@@ -0,0 +1,383 @@
+; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s
+
+; This test checks the @llvm.cttz.* intrinsics for vectors.
+
+declare <1 x i8> @llvm.cttz.v1i8(<1 x i8>, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.cttz.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
+
+declare <1 x i16> @llvm.cttz.v1i16(<1 x i16>, i1)
+declare <2 x i16> @llvm.cttz.v2i16(<2 x i16>, i1)
+declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>, i1)
+declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
+
+declare <1 x i32> @llvm.cttz.v1i32(<1 x i32>, i1)
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
+
+declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>, i1)
+declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i16 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8_zero_undef(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8_zero_undef
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8_zero_undef(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8_zero_undef:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8_zero_undef(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8_zero_undef:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8_zero_undef(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8_zero_undef(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16_zero_undef(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16_zero_undef:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16_zero_undef(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16_zero_undef:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16_zero_undef(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf
+; CHECK: vclz.i16 [[D1]], [[D1]]
+; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16_zero_undef(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf
+; CHECK: vclz.i16 [[Q1]], [[Q1]]
+; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32_zero_undef(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32_zero_undef:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32_zero_undef(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[D1]], [[D1]]
+; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32_zero_undef(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[Q1]], [[Q1]]
+; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64_zero_undef(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64_zero_undef(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}
diff --git a/test/CodeGen/ARM/ctz.ll b/test/CodeGen/ARM/ctz.ll
deleted file mode 100644
index 2d88b0351cf3..000000000000
--- a/test/CodeGen/ARM/ctz.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
-
-declare i32 @llvm.cttz.i32(i32, i1)
-
-define i32 @f1(i32 %a) {
-; CHECK-LABEL: f1:
-; CHECK: rbit
-; CHECK: clz
- %tmp = call i32 @llvm.cttz.i32( i32 %a, i1 true )
- ret i32 %tmp
-}
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index bd170f30d979..e382e78a9950 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=THUMB-LONG
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 6b434b74ca79..1c7ff6879386 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index c3980cb51f67..200387cf8926 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -mattr=+long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -mattr=+long-calls | FileCheck -check-prefix=CHECK-LONG %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index f3e13671ac37..56cdcaedf900 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -3,6 +3,7 @@
; rdar://6949835
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
; Magic ARM pair hints works best with linearscan / fast.
@@ -110,5 +111,73 @@ entry:
ret void
}
+; CHECK-LABEL: strd_spill_ldrd_reload:
+; A8: strd r1, r0, [sp, #-8]!
+; M3: strd r1, r0, [sp, #-8]!
+; BASIC: strd r1, r0, [sp, #-8]!
+; GREEDY: strd r0, r1, [sp, #-8]!
+; CHECK: @ InlineAsm Start
+; CHECK: @ InlineAsm End
+; A8: ldrd r2, r1, [sp]
+; M3: ldrd r2, r1, [sp]
+; BASIC: ldrd r2, r1, [sp]
+; GREEDY: ldrd r1, r2, [sp]
+; CHECK: bl{{x?}} _extfunc
+define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
+ ; force %v0 and %v1 to be spilled
+ call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"()
+ ; force the reloaded %v0, %v1 into different registers
+ call void @extfunc(i32 0, i32 %v0, i32 %v1, i32 7)
+ ret void
+}
+
+declare void @extfunc2(i32*, i32, i32)
+
+; CHECK-LABEL: ldrd_postupdate_dec:
+; CHECK: ldrd r1, r2, [r0], #-8
+; CHECK-NEXT: bl{{x?}} _extfunc
+define void @ldrd_postupdate_dec(i32* %p0) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ %v0 = load i32, i32* %p0
+ %v1 = load i32, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 -2
+ call void @extfunc2(i32* %p1, i32 %v0, i32 %v1)
+ ret void
+}
+
+; CHECK-LABEL: ldrd_postupdate_inc:
+; CHECK: ldrd r1, r2, [r0], #8
+; CHECK-NEXT: bl{{x?}} _extfunc
+define void @ldrd_postupdate_inc(i32* %p0) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ %v0 = load i32, i32* %p0
+ %v1 = load i32, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 2
+ call void @extfunc2(i32* %p1, i32 %v0, i32 %v1)
+ ret void
+}
+
+; CHECK-LABEL: strd_postupdate_dec:
+; CHECK: strd r1, r2, [r0], #-8
+; CHECK-NEXT: bx lr
+define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ store i32 %v0, i32* %p0
+ store i32 %v1, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 -2
+ ret i32* %p1
+}
+
+; CHECK-LABEL: strd_postupdate_inc:
+; CHECK: strd r1, r2, [r0], #8
+; CHECK-NEXT: bx lr
+define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ store i32 %v0, i32* %p0
+ store i32 %v1, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 2
+ ret i32* %p1
+}
+
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll
index 191db1e20a25..f6f8d5623509 100644
--- a/test/CodeGen/ARM/memset-inline.ll
+++ b/test/CodeGen/ARM/memset-inline.ll
@@ -4,8 +4,7 @@ define void @t1(i8* nocapture %c) nounwind optsize {
entry:
; CHECK-LABEL: t1:
; CHECK: movs r1, #0
-; CHECK: str r1, [r0]
-; CHECK: str r1, [r0, #4]
+; CHECK: strd r1, r1, [r0]
; CHECK: str r1, [r0, #8]
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
ret void
diff --git a/test/CodeGen/ARM/nest-register.ll b/test/CodeGen/ARM/nest-register.ll
new file mode 100644
index 000000000000..6b8c3dc47db1
--- /dev/null
+++ b/test/CodeGen/ARM/nest-register.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register.
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: mov pc, lr
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mov r12, r0
+; CHECK-NEXT: bl nest_receiver
+; CHECK: mov pc, lr
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
diff --git a/test/CodeGen/ARM/subtarget-features-long-calls.ll b/test/CodeGen/ARM/subtarget-features-long-calls.ll
new file mode 100644
index 000000000000..430ae3d13307
--- /dev/null
+++ b/test/CodeGen/ARM/subtarget-features-long-calls.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - | FileCheck -check-prefix=NO-OPTION %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -mattr=+long-calls | FileCheck -check-prefix=LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -mattr=-long-calls | FileCheck -check-prefix=NO-LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 | FileCheck -check-prefix=NO-OPTION %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 -mattr=+long-calls | FileCheck -check-prefix=LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 -mattr=-long-calls | FileCheck -check-prefix=NO-LONGCALL %s
+
+; NO-OPTION-LABEL: {{_?}}caller0
+; NO-OPTION: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; NO-OPTION: blx [[R0]]
+; NO-OPTION: [[L0]]:
+; NO-OPTION: .long {{_?}}callee0
+
+; LONGCALL-LABEL: {{_?}}caller0
+; LONGCALL: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; LONGCALL: blx [[R0]]
+; LONGCALL: [[L0]]:
+; LONGCALL: .long {{_?}}callee0
+
+; NO-LONGCALL-LABEL: {{_?}}caller0
+; NO-LONGCALL: bl {{_?}}callee0
+
+define i32 @caller0() #0 {
+entry:
+ tail call void @callee0()
+ ret i32 0
+}
+
+; NO-OPTION-LABEL: {{_?}}caller1
+; NO-OPTION: bl {{_?}}callee0
+
+; LONGCALL-LABEL: {{_?}}caller1
+; LONGCALL: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; LONGCALL: blx [[R0]]
+; LONGCALL: [[L0]]:
+; LONGCALL: .long {{_?}}callee0
+
+; NO-LONGCALL-LABEL: {{_?}}caller1
+; NO-LONGCALL: bl {{_?}}callee0
+
+define i32 @caller1() {
+entry:
+ tail call void @callee0()
+ ret i32 0
+}
+
+declare void @callee0()
+
+attributes #0 = { "target-features"="+long-calls" }
diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
index 96c5fb8961ef..fe335df7a1ad 100644
--- a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
+++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
@@ -5,16 +5,20 @@ target triple = "thumbv7--linux-gnueabi"
declare i8* @llvm.returnaddress(i32)
-define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize {
+define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0, i32 %val1) minsize {
store i32 %val0, i32* %addr
%addr1 = getelementptr i32, i32* %addr, i32 1
+ %addr2 = getelementptr i32, i32* %addr, i32 2
%lr = call i8* @llvm.returnaddress(i32 0)
%lr32 = ptrtoint i8* %lr to i32
- store i32 %lr32, i32* %addr1
- %addr2 = getelementptr i32, i32* %addr1, i32 1
- ret i32* %addr2
+ store i32 %val1, i32* %addr1
+ store i32 %lr32, i32* %addr2
+
+ %addr3 = getelementptr i32, i32* %addr, i32 3
+ ret i32* %addr3
}
-; Check that stm writes two registers. The bug caused one of registers (LR,
+; Check that stm writes three registers. The bug caused one of registers (LR,
; which invalid for Thumb1 form of STMIA instruction) to be dropped.
-; CHECK: stm{{[^,]*}}, {{{.*,.*}}}
+; CHECK-LABEL: wrong-t2stmia-size-reduction:
+; CHECK: stm{{[^,]*}}, {{{.*,.*,.*}}}
diff --git a/test/CodeGen/Generic/run-pass.ll b/test/CodeGen/Generic/run-pass.ll
new file mode 100644
index 000000000000..55d62ec18648
--- /dev/null
+++ b/test/CodeGen/Generic/run-pass.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -debug-pass=Structure -run-pass=gc-lowering -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: -gc-lowering
+; CHECK: FunctionPass Manager
+; CHECK-NEXT: Lower Garbage Collection Instructions
+; CHECK-NEXT: Machine Function Analysis
+; CHECK-NEXT: MIR Printing Pass
diff --git a/test/CodeGen/Hexagon/Atomics.ll b/test/CodeGen/Hexagon/Atomics.ll
new file mode 100644
index 000000000000..bbac5d73c868
--- /dev/null
+++ b/test/CodeGen/Hexagon/Atomics.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=hexagon
+
+@si = common global i32 0, align 4
+@sll = common global i64 0, align 8
+
+define void @test_op_ignore() nounwind {
+entry:
+ %t00 = atomicrmw add i32* @si, i32 1 monotonic
+ %t01 = atomicrmw add i64* @sll, i64 1 monotonic
+ %t10 = atomicrmw sub i32* @si, i32 1 monotonic
+ %t11 = atomicrmw sub i64* @sll, i64 1 monotonic
+ %t20 = atomicrmw or i32* @si, i32 1 monotonic
+ %t21 = atomicrmw or i64* @sll, i64 1 monotonic
+ %t30 = atomicrmw xor i32* @si, i32 1 monotonic
+ %t31 = atomicrmw xor i64* @sll, i64 1 monotonic
+ %t40 = atomicrmw and i32* @si, i32 1 monotonic
+ %t41 = atomicrmw and i64* @sll, i64 1 monotonic
+ %t50 = atomicrmw nand i32* @si, i32 1 monotonic
+ %t51 = atomicrmw nand i64* @sll, i64 1 monotonic
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define void @test_fetch_and_op() nounwind {
+entry:
+ %t00 = atomicrmw add i32* @si, i32 11 monotonic
+ store i32 %t00, i32* @si, align 4
+ %t01 = atomicrmw add i64* @sll, i64 11 monotonic
+ store i64 %t01, i64* @sll, align 8
+ %t10 = atomicrmw sub i32* @si, i32 11 monotonic
+ store i32 %t10, i32* @si, align 4
+ %t11 = atomicrmw sub i64* @sll, i64 11 monotonic
+ store i64 %t11, i64* @sll, align 8
+ %t20 = atomicrmw or i32* @si, i32 11 monotonic
+ store i32 %t20, i32* @si, align 4
+ %t21 = atomicrmw or i64* @sll, i64 11 monotonic
+ store i64 %t21, i64* @sll, align 8
+ %t30 = atomicrmw xor i32* @si, i32 11 monotonic
+ store i32 %t30, i32* @si, align 4
+ %t31 = atomicrmw xor i64* @sll, i64 11 monotonic
+ store i64 %t31, i64* @sll, align 8
+ %t40 = atomicrmw and i32* @si, i32 11 monotonic
+ store i32 %t40, i32* @si, align 4
+ %t41 = atomicrmw and i64* @sll, i64 11 monotonic
+ store i64 %t41, i64* @sll, align 8
+ %t50 = atomicrmw nand i32* @si, i32 11 monotonic
+ store i32 %t50, i32* @si, align 4
+ %t51 = atomicrmw nand i64* @sll, i64 11 monotonic
+ store i64 %t51, i64* @sll, align 8
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define void @test_lock() nounwind {
+entry:
+ %t00 = atomicrmw xchg i32* @si, i32 1 monotonic
+ store i32 %t00, i32* @si, align 4
+ %t01 = atomicrmw xchg i64* @sll, i64 1 monotonic
+ store i64 %t01, i64* @sll, align 8
+ fence seq_cst
+ store volatile i32 0, i32* @si, align 4
+ store volatile i64 0, i64* @sll, align 8
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/common-gep-basic.ll b/test/CodeGen/Hexagon/common-gep-basic.ll
new file mode 100644
index 000000000000..317bf868d0f8
--- /dev/null
+++ b/test/CodeGen/Hexagon/common-gep-basic.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: mpyi
+; CHECK-NOT: mpyi
+; The mpyis from the two GEPs should be commoned out.
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon-unknown--elf"
+
+%struct.s_t = type { %struct.anon, i32 }
+%struct.anon = type { i32, [5 x i32] }
+
+@g = common global [100 x %struct.s_t] zeroinitializer, align 8
+
+; Function Attrs: nounwind
+define void @foo(i32 %x) #0 {
+entry:
+ %cmp = icmp slt i32 %x, 90
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds [100 x %struct.s_t], [100 x %struct.s_t]* @g, i32 0, i32 %x, i32 0, i32 1, i32 2
+ tail call void @bar(i32* %arrayidx1) #0
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx5 = getelementptr inbounds [100 x %struct.s_t], [100 x %struct.s_t]* @g, i32 0, i32 %x, i32 0, i32 1, i32 3
+ tail call void @bar(i32* %arrayidx5) #0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+declare void @bar(i32*) #0
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/Hexagon/common-gep-icm.ll b/test/CodeGen/Hexagon/common-gep-icm.ll
new file mode 100644
index 000000000000..bc5719dfe1d0
--- /dev/null
+++ b/test/CodeGen/Hexagon/common-gep-icm.ll
@@ -0,0 +1,76 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; Rely on the comments generated by llc. Make sure there are no add/addasl
+; instructions in while.body13 (before the loads).
+; CHECK: while.body13
+; CHECK-NOT: add
+; CHECK: memw
+
+%struct.1 = type { i32, i32 }
+%struct.2 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %struct.1], [5 x i32] }
+
+@A1 = global i64 zeroinitializer
+@A2 = global i64 zeroinitializer
+@B1 = global i32 zeroinitializer
+@B2 = global i32 zeroinitializer
+@C1 = global i8 zeroinitializer
+
+declare i32 @llvm.hexagon.S2.cl0(i32) nounwind readnone
+declare i32 @llvm.hexagon.S2.setbit.r(i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.M2.vmpy2s.s0(i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.M2.vmac2s.s0(i64, i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.A2.vaddws(i64, i64) nounwind readnone
+declare i64 @llvm.hexagon.A2.vsubws(i64, i64) nounwind readnone
+declare i32 @llvm.hexagon.A4.modwrapu(i32, i32) nounwind readnone
+
+define void @foo(i32 %n) nounwind {
+entry:
+ br label %while.body
+
+while.body:
+ %count = phi i32 [ 0, %entry ], [ %next, %while.end ]
+ %idx = phi i32 [ 0, %entry ], [ %15, %while.end ]
+ %0 = load i32, i32* @B1, align 4
+ %1 = load i32, i32* @B2, align 8
+ %2 = and i32 %1, %0
+ br label %while.body13
+
+while.body13: ; preds = %while.body, %if.end
+ %3 = phi i64 [ %13, %if.end ], [ 0, %while.body ]
+ %4 = phi i64 [ %14, %if.end ], [ 0, %while.body ]
+ %m = phi i32 [ %6, %if.end ], [ %2, %while.body ]
+ %5 = tail call i32 @llvm.hexagon.S2.cl0(i32 %m)
+ %6 = tail call i32 @llvm.hexagon.S2.setbit.r(i32 %m, i32 %5)
+ %cgep85 = getelementptr [10 x %struct.2], [10 x %struct.2]* inttoptr (i32 -121502345 to [10 x %struct.2]*), i32 0, i32 %idx
+ %cgep90 = getelementptr %struct.2, %struct.2* %cgep85, i32 0, i32 12, i32 %5
+ %7 = load i32, i32* %cgep90, align 4
+ %8 = tail call i64 @llvm.hexagon.M2.vmpy2s.s0(i32 %7, i32 %7)
+ %cgep91 = getelementptr %struct.2, %struct.2* %cgep85, i32 0, i32 13, i32 %5
+ %9 = load i32, i32* %cgep91, align 4
+ %10 = tail call i64 @llvm.hexagon.M2.vmac2s.s0(i64 %8, i32 %9, i32 %9)
+ %11 = load i8, i8* @C1, align 1
+ %and24 = and i8 %11, 1
+ %cmp = icmp eq i8 %and24, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %while.body13
+ %12 = tail call i64 @llvm.hexagon.A2.vaddws(i64 %3, i64 %10)
+ store i64 %12, i64* @A1, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %while.body13
+ %13 = phi i64 [ %12, %if.then ], [ %3, %while.body13 ]
+ %14 = tail call i64 @llvm.hexagon.A2.vsubws(i64 %4, i64 %10)
+ %tobool12 = icmp eq i32 %6, 0
+ br i1 %tobool12, label %while.end, label %while.body13
+
+while.end:
+ %add40 = add i32 %idx, 1
+ %15 = tail call i32 @llvm.hexagon.A4.modwrapu(i32 %add40, i32 10) nounwind
+ %next = add i32 %count, 1
+ %cc = icmp eq i32 %next, %n
+ br i1 %cc, label %end, label %while.body
+
+end:
+ store i64 %10, i64* @A2, align 8
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/extract-basic.ll b/test/CodeGen/Hexagon/extract-basic.ll
new file mode 100644
index 000000000000..c75125cedd35
--- /dev/null
+++ b/test/CodeGen/Hexagon/extract-basic.ll
@@ -0,0 +1,76 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+
+; CHECK-DAG: extractu(r{{[0-9]*}}, #3, #4)
+; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #7)
+; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #16)
+
+; C source:
+; typedef struct {
+; unsigned x1:3;
+; unsigned x2:7;
+; unsigned x3:8;
+; unsigned x4:12;
+; unsigned x5:2;
+; } structx_t;
+;
+; typedef struct {
+; unsigned y1:4;
+; unsigned y2:3;
+; unsigned y3:9;
+; unsigned y4:8;
+; unsigned y5:8;
+; } structy_t;
+;
+; void foo(structx_t *px, structy_t *py) {
+; px->x1 = py->y1;
+; px->x2 = py->y2;
+; px->x3 = py->y3;
+; px->x4 = py->y4;
+; px->x5 = py->y5;
+; }
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%struct.structx_t = type { i8, i8, i8, i8 }
+%struct.structy_t = type { i8, i8, i8, i8 }
+
+define void @foo(%struct.structx_t* nocapture %px, %struct.structy_t* nocapture %py) nounwind {
+entry:
+ %0 = bitcast %struct.structy_t* %py to i32*
+ %1 = load i32, i32* %0, align 4
+ %bf.value = and i32 %1, 7
+ %2 = bitcast %struct.structx_t* %px to i32*
+ %3 = load i32, i32* %2, align 4
+ %4 = and i32 %3, -8
+ %5 = or i32 %4, %bf.value
+ store i32 %5, i32* %2, align 4
+ %6 = load i32, i32* %0, align 4
+ %7 = lshr i32 %6, 4
+ %bf.clear1 = shl nuw nsw i32 %7, 3
+ %8 = and i32 %bf.clear1, 56
+ %9 = and i32 %5, -1017
+ %10 = or i32 %8, %9
+ store i32 %10, i32* %2, align 4
+ %11 = load i32, i32* %0, align 4
+ %12 = lshr i32 %11, 7
+ %bf.value4 = shl i32 %12, 10
+ %13 = and i32 %bf.value4, 261120
+ %14 = and i32 %10, -262081
+ %15 = or i32 %14, %13
+ store i32 %15, i32* %2, align 4
+ %16 = load i32, i32* %0, align 4
+ %17 = lshr i32 %16, 16
+ %bf.clear5 = shl i32 %17, 18
+ %18 = and i32 %bf.clear5, 66846720
+ %19 = and i32 %15, -1073480641
+ %20 = or i32 %19, %18
+ store i32 %20, i32* %2, align 4
+ %21 = load i32, i32* %0, align 4
+ %22 = lshr i32 %21, 24
+ %23 = shl i32 %22, 30
+ %24 = and i32 %20, 67107903
+ %25 = or i32 %24, %23
+ store i32 %25, i32* %2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/fusedandshift.ll b/test/CodeGen/Hexagon/fusedandshift.ll
index 59a1e1d84fcc..414574aec401 100644
--- a/test/CodeGen/Hexagon/fusedandshift.ll
+++ b/test/CodeGen/Hexagon/fusedandshift.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-extract=0 < %s | FileCheck %s
; Check that we generate fused logical and with shift instruction.
+; Disable "extract" generation, since it may eliminate the and/lsr.
; CHECK: r{{[0-9]+}} = and(#15, lsr(r{{[0-9]+}}, #{{[0-9]+}})
diff --git a/test/CodeGen/Hexagon/insert-basic.ll b/test/CodeGen/Hexagon/insert-basic.ll
new file mode 100644
index 000000000000..e941c063d9ed
--- /dev/null
+++ b/test/CodeGen/Hexagon/insert-basic.ll
@@ -0,0 +1,66 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK-DAG: insert(r{{[0-9]*}}, #17, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #18, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #22, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #12, #0)
+
+; C source:
+; typedef struct {
+; unsigned x1:23;
+; unsigned x2:17;
+; unsigned x3:18;
+; unsigned x4:22;
+; unsigned x5:12;
+; } structx_t;
+;
+; void foo(structx_t *px, int y1, int y2, int y3, int y4, int y5) {
+; px->x1 = y1;
+; px->x2 = y2;
+; px->x3 = y3;
+; px->x4 = y4;
+; px->x5 = y5;
+; }
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%struct.structx_t = type { [3 x i8], i8, [3 x i8], i8, [3 x i8], i8, [3 x i8], i8, [2 x i8], [2 x i8] }
+
+define void @foo(%struct.structx_t* nocapture %px, i32 %y1, i32 %y2, i32 %y3, i32 %y4, i32 %y5) nounwind {
+entry:
+ %bf.value = and i32 %y1, 8388607
+ %0 = bitcast %struct.structx_t* %px to i32*
+ %1 = load i32, i32* %0, align 4
+ %2 = and i32 %1, -8388608
+ %3 = or i32 %2, %bf.value
+ store i32 %3, i32* %0, align 4
+ %bf.value1 = and i32 %y2, 131071
+ %bf.field.offs = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 4
+ %4 = bitcast i8* %bf.field.offs to i32*
+ %5 = load i32, i32* %4, align 4
+ %6 = and i32 %5, -131072
+ %7 = or i32 %6, %bf.value1
+ store i32 %7, i32* %4, align 4
+ %bf.value2 = and i32 %y3, 262143
+ %bf.field.offs3 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 8
+ %8 = bitcast i8* %bf.field.offs3 to i32*
+ %9 = load i32, i32* %8, align 4
+ %10 = and i32 %9, -262144
+ %11 = or i32 %10, %bf.value2
+ store i32 %11, i32* %8, align 4
+ %bf.value4 = and i32 %y4, 4194303
+ %bf.field.offs5 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 12
+ %12 = bitcast i8* %bf.field.offs5 to i32*
+ %13 = load i32, i32* %12, align 4
+ %14 = and i32 %13, -4194304
+ %15 = or i32 %14, %bf.value4
+ store i32 %15, i32* %12, align 4
+ %bf.value6 = and i32 %y5, 4095
+ %bf.field.offs7 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 16
+ %16 = bitcast i8* %bf.field.offs7 to i32*
+ %17 = load i32, i32* %16, align 4
+ %18 = and i32 %17, -4096
+ %19 = or i32 %18, %bf.value6
+ store i32 %19, i32* %16, align 4
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/predicate-logical.ll b/test/CodeGen/Hexagon/predicate-logical.ll
new file mode 100644
index 000000000000..be2bcb03d6a1
--- /dev/null
+++ b/test/CodeGen/Hexagon/predicate-logical.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: p{{[0-9]}} = or(p{{[0-9]}}, and(p{{[0-9]}}, p{{[0-9]}}))
+
+target triple = "hexagon"
+
+define i32 @foo(i64* nocapture %p, i64* nocapture %q) nounwind readonly {
+entry:
+ %incdec.ptr = getelementptr inbounds i64, i64* %p, i32 1
+ %0 = load i64, i64* %p, align 8, !tbaa !0
+ %incdec.ptr1 = getelementptr inbounds i64, i64* %q, i32 1
+ %1 = load i64, i64* %q, align 8, !tbaa !0
+ %2 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %0, i64 %1)
+ %incdec.ptr2 = getelementptr inbounds i64, i64* %p, i32 2
+ %3 = load i64, i64* %incdec.ptr, align 8, !tbaa !0
+ %incdec.ptr3 = getelementptr inbounds i64, i64* %q, i32 2
+ %4 = load i64, i64* %incdec.ptr1, align 8, !tbaa !0
+ %5 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %3, i64 %4)
+ %6 = load i64, i64* %incdec.ptr2, align 8, !tbaa !0
+ %7 = load i64, i64* %incdec.ptr3, align 8, !tbaa !0
+ %8 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %6, i64 %7)
+ %and = and i32 %5, %2
+ %or = or i32 %8, %and
+ ret i32 %or
+}
+
+declare i32 @llvm.hexagon.A2.vcmpwgtu(i64, i64) nounwind readnone
+
+!0 = !{!"long long", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/predicate-rcmp.ll b/test/CodeGen/Hexagon/predicate-rcmp.ll
new file mode 100644
index 000000000000..45daa88d7161
--- /dev/null
+++ b/test/CodeGen/Hexagon/predicate-rcmp.ll
@@ -0,0 +1,19 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: cmp.eq(r{{[0-9]+}}, #0)
+; Check that the result of the builtin is not stored directly, i.e. that
+; there is an instruction that converts it to {0,1} from {0,-1}. Right now
+; the instruction is "r4 = !cmp.eq(r0, #0)".
+
+@var = common global i32 0, align 4
+declare i32 @llvm.hexagon.C2.cmpgtup(i64,i64) nounwind
+
+define void @foo(i64 %a98, i64 %a100) nounwind {
+entry:
+ %a101 = tail call i32 @llvm.hexagon.C2.cmpgtup(i64 %a98, i64 %a100)
+ %tobool250 = icmp eq i32 %a101, 0
+ %a102 = zext i1 %tobool250 to i8
+ %detected.0 = xor i8 %a102, 1
+ %conv253 = zext i8 %detected.0 to i32
+ store i32 %conv253, i32* @var, align 4
+ ret void
+}
diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir
new file mode 100644
index 000000000000..d749a0524422
--- /dev/null
+++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir
@@ -0,0 +1,25 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses basic block liveins correctly.
+
+--- |
+
+ define i32 @test(i32 %a, i32 %b) {
+ body:
+ %c = add i32 %a, %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+body:
+ # CHECK: name: body
+ # CHECK: liveins: [ '%edi', '%esi' ]
+ # CHECK-NEXT: instructions:
+ - id: 0
+ name: body
+ liveins: [ '%edi', '%esi' ]
+ instructions:
+ - '%eax = LEA64_32r killed %rdi, 1, killed %rsi, 0, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/dead-register-flag.mir b/test/CodeGen/MIR/X86/dead-register-flag.mir
new file mode 100644
index 000000000000..988b554659cb
--- /dev/null
+++ b/test/CodeGen/MIR/X86/dead-register-flag.mir
@@ -0,0 +1,26 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'dead' register flags
+# correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) #0 {
+ body:
+ %c = mul i32 %a, 11
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: foo
+body:
+ # CHECK: name: body
+ - id: 0
+ name: body
+ instructions:
+ # CHECK: - '%eax = IMUL32rri8 %edi, 11, implicit-def dead %eflags'
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def dead %eflags'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
new file mode 100644
index 000000000000..c5f5aaca34e0
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
@@ -0,0 +1,38 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:26: expected an implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eax'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
new file mode 100644
index 000000000000..ecf3a122bf66
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
@@ -0,0 +1,38 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:26: expected an implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit, implicit-def %eflags'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-named-register-livein.mir b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
new file mode 100644
index 000000000000..1fbe881c8c70
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
@@ -0,0 +1,21 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) {
+ body:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+body:
+ - id: 0
+ name: body
+ # CHECK: [[@LINE+1]]:21: expected a named register
+ liveins: [ '%0' ]
+ instructions:
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-number-after-bb.mir b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
index f4248a76be46..5343a847fbb9 100644
--- a/test/CodeGen/MIR/X86/expected-number-after-bb.mir
+++ b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
@@ -23,13 +23,13 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:18: expected a number after '%bb.'
- - 'JG_1 %bb.nah'
+ - 'JG_1 %bb.nah, implicit %eflags'
- id: 1
name: yes
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: nah
instructions:
diff --git a/test/CodeGen/MIR/X86/expected-register-after-flags.mir b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
new file mode 100644
index 000000000000..111f5496a378
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
@@ -0,0 +1,22 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported when a register operand doesn't
+# follow register flags.
+
+--- |
+
+ define i32 @foo() {
+ entry:
+ ret i32 0
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: [[@LINE+1]]:37: expected a register after register flags
+ - '%eax = MOV32r0 implicit-def 2'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
new file mode 100644
index 000000000000..c891a115a180
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
@@ -0,0 +1,29 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:25: expected a subregister index after ':'
+ - '%1 = COPY %0 : 42'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
diff --git a/test/CodeGen/MIR/X86/fixed-stack-objects.mir b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
new file mode 100644
index 000000000000..dcbe6f73a6d0
--- /dev/null
+++ b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
@@ -0,0 +1,35 @@
+# RUN: llc -march=x86 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses fixed stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 4
+ maxAlignment: 4
+# CHECK: fixedStack:
+# CHECK-NEXT: - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+fixedStack:
+ - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+stack:
+ - { id: 0, offset: -8, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %esp, 1, _, 8, _'
+ - 'MOV32mr %esp, 1, _, 0, _, %eax'
+ - 'RETL %eax'
+...
diff --git a/test/CodeGen/MIR/X86/global-value-operands.mir b/test/CodeGen/MIR/X86/global-value-operands.mir
index 4aa88fe96ceb..3ea729b00554 100644
--- a/test/CodeGen/MIR/X86/global-value-operands.mir
+++ b/test/CodeGen/MIR/X86/global-value-operands.mir
@@ -31,7 +31,7 @@ body:
# CHECK: - '%rax = MOV64rm %rip, 1, _, @G, _'
- '%rax = MOV64rm %rip, 1, _, @G, _'
- '%eax = MOV32rm %rax, 1, _, 0, _'
- - '%eax = INC32r %eax'
+ - '%eax = INC32r %eax, implicit-def %eflags'
- 'RETQ %eax'
...
---
@@ -44,6 +44,6 @@ body:
# CHECK: - '%rax = MOV64rm %rip, 1, _, @0, _'
- '%rax = MOV64rm %rip, 1, _, @0, _'
- '%eax = MOV32rm %rax, 1, _, 0, _'
- - '%eax = INC32r %eax'
+ - '%eax = INC32r %eax, implicit-def %eflags'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/implicit-register-flag.mir b/test/CodeGen/MIR/X86/implicit-register-flag.mir
new file mode 100644
index 000000000000..9c6882d27bdc
--- /dev/null
+++ b/test/CodeGen/MIR/X86/implicit-register-flag.mir
@@ -0,0 +1,41 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'implicit' and 'implicit-def'
+# register flags correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ # CHECK-NEXT: - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - id: 1
+ name: less
+ instructions:
+ # CHECK: - '%eax = MOV32r0 implicit-def %eflags'
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ instructions:
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/killed-register-flag.mir b/test/CodeGen/MIR/X86/killed-register-flag.mir
new file mode 100644
index 000000000000..d654a9d2fa56
--- /dev/null
+++ b/test/CodeGen/MIR/X86/killed-register-flag.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'killed' register flags
+# correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - id: 1
+ name: less
+ instructions:
+ # CHECK: - '%eax = MOV32r0
+ # CHECK-NEXT: - 'RETQ killed %eax
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - 'RETQ killed %eax'
+ - id: 2
+ name: exit
+ instructions:
+ # CHECK: - '%eax = COPY killed %edi
+ # CHECK-NEXT: - 'RETQ killed %eax
+ - '%eax = COPY killed %edi'
+ - 'RETQ killed %eax'
+...
diff --git a/test/CodeGen/MIR/X86/large-index-number-error.mir b/test/CodeGen/MIR/X86/large-index-number-error.mir
index 61a5bdfe2edb..fdb25c907f52 100644
--- a/test/CodeGen/MIR/X86/large-index-number-error.mir
+++ b/test/CodeGen/MIR/X86/large-index-number-error.mir
@@ -23,12 +23,12 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: expected 32-bit integer (too large)
- - 'JG_1 %bb.123456789123456'
+ - 'JG_1 %bb.123456789123456, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
index 9d1bd0bd58ad..607acb5f273e 100644
--- a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
+++ b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
@@ -41,13 +41,13 @@ body:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
# CHECK: - 'CMP32ri8 %eax, 10
# CHECK-NEXT: - 'JG_1 %bb.2.exit
- - 'CMP32ri8 %eax, 10'
- - 'JG_1 %bb.2.exit'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
# CHECK: name: less
- id: 1
name: less
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: exit
instructions:
@@ -64,11 +64,11 @@ body:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
# CHECK: - 'CMP32ri8 %eax, 10
# CHECK-NEXT: - 'JG_1 %bb.2
- - 'CMP32ri8 %eax, 10'
- - 'JG_1 %bb.3'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+ - 'JG_1 %bb.3, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 3
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/machine-instructions.mir b/test/CodeGen/MIR/X86/machine-instructions.mir
index b743198cf270..08f3d76486b1 100644
--- a/test/CodeGen/MIR/X86/machine-instructions.mir
+++ b/test/CodeGen/MIR/X86/machine-instructions.mir
@@ -18,8 +18,8 @@ body:
- id: 0
name: entry
instructions:
- # CHECK: - IMUL32rri8
+ # CHECK: - MOV32rr
# CHECK-NEXT: - RETQ
- - IMUL32rri8
+ - MOV32rr
- ' RETQ '
...
diff --git a/test/CodeGen/MIR/X86/missing-implicit-operand.mir b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
new file mode 100644
index 000000000000..4d2cd03f4a3d
--- /dev/null
+++ b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
@@ -0,0 +1,40 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when an instruction
+# is missing one of its implicit register operands.
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:24: missing implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/named-registers.mir b/test/CodeGen/MIR/X86/named-registers.mir
index 5defb8489e1e..91ed48568678 100644
--- a/test/CodeGen/MIR/X86/named-registers.mir
+++ b/test/CodeGen/MIR/X86/named-registers.mir
@@ -18,6 +18,6 @@ body:
instructions:
# CHECK: - '%eax = MOV32r0
# CHECK-NEXT: - 'RETQ %eax
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/register-mask-operands.mir b/test/CodeGen/MIR/X86/register-mask-operands.mir
index ecaedeae4dbd..f4136598ff5c 100644
--- a/test/CodeGen/MIR/X86/register-mask-operands.mir
+++ b/test/CodeGen/MIR/X86/register-mask-operands.mir
@@ -24,7 +24,7 @@ body:
- id: 0
name: body
instructions:
- - '%eax = IMUL32rri8 %edi, 11'
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def %eflags'
- 'RETQ %eax'
...
---
@@ -35,9 +35,9 @@ body:
name: entry
instructions:
# CHECK: - 'PUSH64r %rax
- # CHECK-NEXT: - 'CALL64pcrel32 @compute, csr_64, %rsp, %edi, %rsp, %eax'
- - 'PUSH64r %rax'
- - 'CALL64pcrel32 @compute, csr_64, %rsp, %edi, %rsp, %eax'
- - '%rdx = POP64r'
+ # CHECK-NEXT: - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - 'PUSH64r %rax, implicit-def %rsp, implicit %rsp'
+ - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - '%rdx = POP64r implicit-def %rsp, implicit %rsp'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
new file mode 100644
index 000000000000..67f4bd21cd05
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
@@ -0,0 +1,32 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+fixedStack:
+ # CHECK: [[@LINE+1]]:63: unknown key 'isAliased'
+ - { id: 0, type: spill-slot, offset: 0, size: 4, isAliased: true }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
new file mode 100644
index 000000000000..1e1b0fdcc8dc
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
@@ -0,0 +1,32 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+fixedStack:
+ # CHECK: [[@LINE+1]]:65: unknown key 'isImmutable'
+ - { id: 0, type: spill-slot, offset: 0, size: 4, isImmutable: true }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
new file mode 100644
index 000000000000..f771f796ec34
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
@@ -0,0 +1,34 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses fixed stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+# CHECK: fixedStack:
+# CHECK-NEXT: - { id: 0, type: spill-slot, offset: 0, size: 4, alignment: 4 }
+fixedStack:
+ - { id: 0, type: spill-slot, offset: 0, size: 4, alignment: 4 }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/stack-objects.mir b/test/CodeGen/MIR/X86/stack-objects.mir
new file mode 100644
index 000000000000..14ed4b74f96f
--- /dev/null
+++ b/test/CodeGen/MIR/X86/stack-objects.mir
@@ -0,0 +1,39 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 8
+# CHECK: stack:
+# CHECK-NEXT: - { id: 0, offset: -12, size: 4, alignment: 4 }
+# CHECK-NEXT: - { id: 1, offset: -24, size: 8, alignment: 8 }
+# CHECK-NEXT: - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+ - { id: 1, offset: -24, size: 8, alignment: 8 }
+ - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/subregister-operands.mir b/test/CodeGen/MIR/X86/subregister-operands.mir
new file mode 100644
index 000000000000..5e46fab4b058
--- /dev/null
+++ b/test/CodeGen/MIR/X86/subregister-operands.mir
@@ -0,0 +1,33 @@
+# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses subregisters in register operands
+# correctly.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = COPY %0:sub_8bit
+ - '%0 = COPY %edi'
+ - '%1 = COPY %0:sub_8bit'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
+
diff --git a/test/CodeGen/MIR/X86/undef-register-flag.mir b/test/CodeGen/MIR/X86/undef-register-flag.mir
new file mode 100644
index 000000000000..83b9e10a80d1
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undef-register-flag.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'undef' register flags
+# correctly.
+
+--- |
+
+ define i32 @compute(i32 %a) #0 {
+ body:
+ %c = mul i32 %a, 11
+ ret i32 %c
+ }
+
+ define i32 @foo(i32 %a) #0 {
+ entry:
+ %b = call i32 @compute(i32 %a)
+ ret i32 %b
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: compute
+body:
+ - id: 0
+ name: body
+ instructions:
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def %eflags'
+ - 'RETQ %eax'
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: - 'PUSH64r undef %rax
+ - 'PUSH64r undef %rax, implicit-def %rsp, implicit %rsp'
+ - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - '%rdx = POP64r implicit-def %rsp, implicit %rsp'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/undefined-register-class.mir b/test/CodeGen/MIR/X86/undefined-register-class.mir
new file mode 100644
index 000000000000..a14d2303a7d8
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undefined-register-class.mir
@@ -0,0 +1,26 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when it encounters an
+# unknown register class.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+registers:
+ # CHECK: [[@LINE+1]]:20: use of undefined register class 'gr3200'
+ - {id: 0, class: 'gr3200'}
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'RETQ %eax'
+...
+
diff --git a/test/CodeGen/MIR/X86/undefined-virtual-register.mir b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
new file mode 100644
index 000000000000..12370c80caf9
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
@@ -0,0 +1,28 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when parsing a
+# reference to an undefined virtual register.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:22: use of undefined virtual register '%10'
+ - '%eax = COPY %10'
+ - 'RETQ %eax'
+...
+
diff --git a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
index 5bc979a83eaf..a82e9a780f54 100644
--- a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
@@ -26,12 +26,12 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: use of undefined machine basic block #4
- - 'JG_1 %bb.4'
+ - 'JG_1 %bb.4, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
index cd8c5402256f..f304113f40b9 100644
--- a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
@@ -25,13 +25,13 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: the name of machine basic block #2 isn't 'hit'
- - 'JG_1 %bb.2.hit'
+ - 'JG_1 %bb.2.hit, implicit %eflags'
- id: 1
name: less
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: exit
instructions:
diff --git a/test/CodeGen/MIR/X86/unknown-subregister-index.mir b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
new file mode 100644
index 000000000000..50461232b623
--- /dev/null
+++ b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
@@ -0,0 +1,31 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported when an unknown subregister index
+# is encountered.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:23: use of unknown subregister index 'bit8'
+ - '%1 = COPY %0:bit8'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
new file mode 100644
index 000000000000..8e50c52f5e18
--- /dev/null
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
@@ -0,0 +1,36 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ %y = alloca i32, i32 %a
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 24
+ offsetAdjustment: -16
+ maxAlignment: 8
+ adjustsStack: true
+stack:
+ - { id: 0, offset: -20, size: 4, alignment: 4 }
+ - { id: 1, offset: -32, size: 8, alignment: 8 }
+ # CHECK: [[@LINE+1]]:55: unknown key 'size'
+ - { id: 2, type: variable-sized, offset: -32, size: 42, alignment: 1 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
new file mode 100644
index 000000000000..4c45742b25a4
--- /dev/null
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses variable sized stack objects
+# correctly.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ %y = alloca i32, i32 %a
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 24
+ offsetAdjustment: -16
+ maxAlignment: 8
+ adjustsStack: true
+# CHECK: stack:
+# CHECK-NEXT: - { id: 0, offset: -20, size: 4, alignment: 4 }
+# CHECK-NEXT: - { id: 1, offset: -32, size: 8, alignment: 8 }
+# CHECK-NEXT: - { id: 2, type: variable-sized, offset: -32, alignment: 1 }
+stack:
+ - { id: 0, offset: -20, size: 4, alignment: 4 }
+ - { id: 1, offset: -32, size: 8, alignment: 8 }
+ - { id: 2, type: variable-sized, offset: -32, alignment: 1 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/virtual-registers.mir b/test/CodeGen/MIR/X86/virtual-registers.mir
new file mode 100644
index 000000000000..c6d76e6a18c5
--- /dev/null
+++ b/test/CodeGen/MIR/X86/virtual-registers.mir
@@ -0,0 +1,105 @@
+# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses virtual register definitions and
+# references correctly.
+
+--- |
+
+ define i32 @bar(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: bar
+isSSA: true
+tracksRegLiveness: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr32 }
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = SUB32ri8 %0, 10
+ instructions:
+ - '%0 = COPY %edi'
+ - '%1 = SUB32ri8 %0, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'JMP_1 %bb.1.less'
+ - id: 1
+ name: less
+ # CHECK: %2 = MOV32r0
+ # CHECK-NEXT: %eax = COPY %2
+ instructions:
+ - '%2 = MOV32r0 implicit-def %eflags'
+ - '%eax = COPY %2'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ instructions:
+ - '%eax = COPY %0'
+ - 'RETQ %eax'
+...
+---
+name: foo
+isSSA: true
+tracksRegLiveness: true
+# CHECK: name: foo
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr32 }
+registers:
+ - { id: 2, class: gr32 }
+ - { id: 0, class: gr32 }
+ - { id: 10, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = SUB32ri8 %0, 10
+ instructions:
+ - '%2 = COPY %edi'
+ - '%0 = SUB32ri8 %2, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'JMP_1 %bb.1.less'
+ - id: 1
+ name: less
+ # CHECK: %2 = MOV32r0
+ # CHECK-NEXT: %eax = COPY %2
+ instructions:
+ - '%10 = MOV32r0 implicit-def %eflags'
+ - '%eax = COPY %10'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ # CHECK: %eax = COPY %0
+ instructions:
+ - '%eax = COPY %2'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/frame-info.mir b/test/CodeGen/MIR/frame-info.mir
new file mode 100644
index 000000000000..c5468f94f33a
--- /dev/null
+++ b/test/CodeGen/MIR/frame-info.mir
@@ -0,0 +1,91 @@
+# RUN: llc -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses machine frame info properties
+# correctly.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ define i32 @test2(i32 %a) {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+
+# CHECK: frameInfo:
+# CHECK-NEXT: isFrameAddressTaken: false
+# CHECK-NEXT: isReturnAddressTaken: false
+# CHECK-NEXT: hasStackMap: false
+# CHECK-NEXT: hasPatchPoint: false
+# CHECK-NEXT: stackSize: 0
+# CHECK-NEXT: offsetAdjustment: 0
+# Note: max alignment can be target specific when printed.
+# CHECK-NEXT: maxAlignment:
+# CHECK-NEXT: adjustsStack: false
+# CHECK-NEXT: hasCalls: false
+# CHECK-NEXT: maxCallFrameSize: 0
+# CHECK-NEXT: hasOpaqueSPAdjustment: false
+# CHECK-NEXT: hasVAStart: false
+# CHECK-NEXT: hasMustTailInVarArgFunc: false
+# CHECK: body
+frameInfo:
+ maxAlignment: 4
+body:
+ - id: 0
+ name: entry
+...
+---
+name: test2
+isSSA: true
+tracksRegLiveness: true
+
+# CHECK: test2
+# CHECK: frameInfo:
+# CHECK-NEXT: isFrameAddressTaken: true
+# CHECK-NEXT: isReturnAddressTaken: true
+# CHECK-NEXT: hasStackMap: true
+# CHECK-NEXT: hasPatchPoint: true
+# CHECK-NEXT: stackSize: 4
+# CHECK-NEXT: offsetAdjustment: 4
+# Note: max alignment can be target specific when printed.
+# CHECK-NEXT: maxAlignment:
+# CHECK-NEXT: adjustsStack: true
+# CHECK-NEXT: hasCalls: true
+# CHECK-NEXT: maxCallFrameSize: 4
+# CHECK-NEXT: hasOpaqueSPAdjustment: true
+# CHECK-NEXT: hasVAStart: true
+# CHECK-NEXT: hasMustTailInVarArgFunc: true
+# CHECK: body
+frameInfo:
+ isFrameAddressTaken: true
+ isReturnAddressTaken: true
+ hasStackMap: true
+ hasPatchPoint: true
+ stackSize: 4
+ offsetAdjustment: 4
+ maxAlignment: 4
+ adjustsStack: true
+ hasCalls: true
+ maxCallFrameSize: 4
+ hasOpaqueSPAdjustment: true
+ hasVAStart: true
+ hasMustTailInVarArgFunc: true
+body:
+ - id: 0
+ name: entry
+...
+
diff --git a/test/CodeGen/MIR/llvmIR.mir b/test/CodeGen/MIR/llvmIR.mir
index 4d7fde240c5b..3c084ad7d393 100644
--- a/test/CodeGen/MIR/llvmIR.mir
+++ b/test/CodeGen/MIR/llvmIR.mir
@@ -32,4 +32,6 @@
...
---
name: foo
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/llvmIRMissing.mir b/test/CodeGen/MIR/llvmIRMissing.mir
index 83d846ba44c3..80cea5a6fdaa 100644
--- a/test/CodeGen/MIR/llvmIRMissing.mir
+++ b/test/CodeGen/MIR/llvmIRMissing.mir
@@ -4,4 +4,6 @@
---
# CHECK: name: foo
name: foo
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-basic-block-unknown-name.mir b/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
index ed675c5edbc3..df8eee9d2708 100644
--- a/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
+++ b/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
@@ -13,7 +13,7 @@
---
name: foo
body:
- # CHECK: basic block 'entrie' is not defined in the function 'foo'
+ # CHECK: [[@LINE+2]]:18: basic block 'entrie' is not defined in the function 'foo'
- id: 0
name: entrie
...
diff --git a/test/CodeGen/MIR/machine-function-missing-body-error.mir b/test/CodeGen/MIR/machine-function-missing-body-error.mir
new file mode 100644
index 000000000000..0dc7477f6275
--- /dev/null
+++ b/test/CodeGen/MIR/machine-function-missing-body-error.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when it encounters a
+# machine function with an empty body.
+
+--- |
+
+ define i32 @foo() {
+ ret i32 0
+ }
+
+...
+---
+# CHECK: machine function 'foo' requires at least one machine basic block in its body
+name: foo
+...
diff --git a/test/CodeGen/MIR/machine-function-missing-function.mir b/test/CodeGen/MIR/machine-function-missing-function.mir
index eed4142d6597..424c34aae847 100644
--- a/test/CodeGen/MIR/machine-function-missing-function.mir
+++ b/test/CodeGen/MIR/machine-function-missing-function.mir
@@ -12,8 +12,12 @@
...
---
name: foo
+body:
+ - id: 0
...
---
# CHECK: function 'faa' isn't defined in the provided LLVM IR
name: faa
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-function-missing-name.mir b/test/CodeGen/MIR/machine-function-missing-name.mir
index b16156e54bd1..a868a65d35f2 100644
--- a/test/CodeGen/MIR/machine-function-missing-name.mir
+++ b/test/CodeGen/MIR/machine-function-missing-name.mir
@@ -16,7 +16,11 @@
---
# CHECK: [[@LINE+1]]:1: missing required key 'name'
nme: foo
+body:
+ - id: 0
...
---
name: bar
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-function.mir b/test/CodeGen/MIR/machine-function.mir
index 8f053adc22be..afd10ab02c26 100644
--- a/test/CodeGen/MIR/machine-function.mir
+++ b/test/CodeGen/MIR/machine-function.mir
@@ -27,6 +27,8 @@
# CHECK-NEXT: hasInlineAsm: false
# CHECK: ...
name: foo
+body:
+ - id: 0
...
---
# CHECK: name: bar
@@ -35,6 +37,8 @@ name: foo
# CHECK-NEXT: hasInlineAsm: false
# CHECK: ...
name: bar
+body:
+ - id: 0
...
---
# CHECK: name: func
@@ -44,6 +48,8 @@ name: bar
# CHECK: ...
name: func
alignment: 8
+body:
+ - id: 0
...
---
# CHECK: name: func2
@@ -55,4 +61,6 @@ name: func2
alignment: 16
exposesReturnsTwice: true
hasInlineAsm: true
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/register-info.mir b/test/CodeGen/MIR/register-info.mir
index c01997b46859..9585faa96223 100644
--- a/test/CodeGen/MIR/register-info.mir
+++ b/test/CodeGen/MIR/register-info.mir
@@ -22,6 +22,8 @@
# CHECK-NEXT: tracksSubRegLiveness: false
# CHECK: ...
name: foo
+body:
+ - id: 0
...
---
# CHECK: name: bar
@@ -33,4 +35,6 @@ name: bar
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: true
+body:
+ - id: 0
...
diff --git a/test/CodeGen/NVPTX/loop-vectorize.ll b/test/CodeGen/NVPTX/loop-vectorize.ll
new file mode 100644
index 000000000000..1b337441ac96
--- /dev/null
+++ b/test/CodeGen/NVPTX/loop-vectorize.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define void @no_vectorization(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: no_vectorization(
+; CHECK-NOT: <4 x i32>
+; CHECK-NOT: <4 x i1>
+entry:
+ %cmp.5 = icmp sgt i32 %n, 0
+ br i1 %cmp.5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %add = add nsw i32 %i.06, %a
+ %mul = mul nsw i32 %add, %b
+ %cmp1 = icmp sgt i32 %mul, -1
+ tail call void @llvm.assume(i1 %cmp1)
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }
+
+!nvvm.annotations = !{!0}
+!0 = !{void (i32, i32, i32)* @no_vectorization, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
new file mode 100644
index 000000000000..c3adfc4646cf
--- /dev/null
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
+; llvm.mem* intrinsics get lowered to loops.
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 false)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_caller
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: ld.u8 %rs[[REG:[0-9]+]]
+; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
+
+define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 true)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_volatile_caller
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: ld.volatile.u8 %rs[[REG:[0-9]+]]
+; CHECK: st.volatile.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
+
+define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
+entry:
+ %0 = trunc i32 %c to i8
+ tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 false)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memset_caller(
+; CHECK: ld.param.u8 %rs[[REG:[0-9]+]]
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
diff --git a/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll b/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
index 16dc2ccb111d..6013a412924f 100644
--- a/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
+++ b/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
@@ -134,6 +134,36 @@ entry:
; CHECK: xvcmpgtsp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
}
+; Function Attrs: nounwind
+define <4 x float> @emit_xvresp(<4 x float> %a) {
+entry:
+ %a.addr = alloca <4 x float>, align 16
+ store <4 x float> %a, <4 x float>* %a.addr, align 16
+ %0 = load <4 x float>, <4 x float>* %a.addr, align 16
+ %1 = call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> %0)
+ ret <4 x float> %1
+; CHECK-LABEL: @emit_xvresp
+; CHECK: xvresp {{[0-9]+}}, {{[0-9]+}}
+}
+
+; Function Attrs: nounwind
+define <2 x double> @emit_xvredp(<2 x double> %a) {
+entry:
+ %a.addr = alloca <2 x double>, align 16
+ store <2 x double> %a, <2 x double>* %a.addr, align 16
+ %0 = load <2 x double>, <2 x double>* %a.addr, align 16
+ %1 = call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> %0)
+ ret <2 x double> %1
+; CHECK-LABEL: @emit_xvredp
+; CHECK: xvredp {{[0-9]+}}, {{[0-9]+}}
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>)
+
; Function Attrs: nounwind readnone
declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
diff --git a/test/CodeGen/PowerPC/ppc-crbits-onoff.ll b/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
index 88648df5fa36..c69f30017d88 100644
--- a/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
+++ b/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
@@ -15,8 +15,8 @@ entry:
; CHECK-DAG: cmplwi {{[0-9]+}}, 3, 0
; CHECK-DAG: li [[REG2:[0-9]+]], 1
; CHECK-DAG: cntlzw [[REG3:[0-9]+]],
-; CHECK: isel 3, 0, [[REG2]]
-; CHECK: and 3, 3, [[REG3]]
+; CHECK: isel [[REG4:[0-9]+]], 0, [[REG2]]
+; CHECK: and 3, [[REG4]], [[REG3]]
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc32-nest.ll b/test/CodeGen/PowerPC/ppc32-nest.ll
new file mode 100644
index 000000000000..ed7bbe2b8f37
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-nest.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register (r11 for PPC).
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: # BB#0:
+; CHECK-NEXT: mr 3, 11
+; CHECK-NEXT: blr
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mr 11, 3
+; CHECK-NEXT: bl nest_receiver
+; CHECK: blr
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc.ll b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
index 66f6a2c790c6..ff0768ff47ed 100644
--- a/test/CodeGen/PowerPC/ppc64-anyregcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
@@ -82,7 +82,7 @@ target triple = "powerpc64-unknown-linux-gnu"
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 24, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 40, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
@@ -104,7 +104,7 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 24, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 40, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
@@ -127,7 +127,7 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 40, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
@@ -150,7 +150,7 @@ define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 24, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 40, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
@@ -232,7 +232,7 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 24, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 40, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -314,7 +314,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 40, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -342,7 +342,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 40, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
ret i64 %result
@@ -384,7 +384,7 @@ define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 40, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
index f90519836c25..92d6d556738c 100644
--- a/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
+++ b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
@@ -35,7 +35,7 @@ define fastcc double @f2(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, doub
}
define void @cg2(i64 %v) #0 {
- tail call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
+ call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
ret void
; CHECK-LABEL: @cg2
@@ -44,11 +44,11 @@ define void @cg2(i64 %v) #0 {
}
define void @cf2(double %v) #0 {
- tail call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
+ call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
ret void
; CHECK-LABEL: @cf2
-; CHECK: mr 2, 1
+; CHECK: fmr 2, 1
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc.ll b/test/CodeGen/PowerPC/ppc64-fastcc.ll
index bb1365a3b675..69e15d104da8 100644
--- a/test/CodeGen/PowerPC/ppc64-fastcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-fastcc.ll
@@ -521,8 +521,9 @@ define void @cv13(<4 x i32> %v) #0 {
ret void
; CHECK-LABEL: @cv13
-; CHECK: li [[REG1:[0-9]+]], 96
-; CHECK: stvx 2, 1, [[REG1]]
+; CHECK-DAG: li [[REG1:[0-9]+]], 96
+; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
@@ -531,8 +532,9 @@ define void @cv14(<4 x i32> %v) #0 {
ret void
; CHECK-LABEL: @cv14
-; CHECK: li [[REG1:[0-9]+]], 128
-; CHECK: stvx 2, 1, [[REG1]]
+; CHECK-DAG: li [[REG1:[0-9]+]], 128
+; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc64-nest.ll b/test/CodeGen/PowerPC/ppc64-nest.ll
new file mode 100644
index 000000000000..9dd88db2fb5d
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-nest.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register (r11 for PPC).
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: # BB#0:
+; CHECK-NEXT: mr 3, 11
+; CHECK-NEXT: blr
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mr 11, 3
+; CHECK-NEXT: bl nest_receiver
+; CHECK: blr
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
+
+define void @test_indirect(i32 ()* nocapture %f, i8* %p) {
+entry:
+
+; CHECK-LABEL: test_indirect
+; CHECK-DAG: ld [[DEST:[0-9]+]], 0(3)
+; CHECK-DAG: ld 2, 8(3)
+; CHECK-DAG: mr 11, 4
+; CHECK: mtctr [[DEST]]
+; CHECK: bctrl
+; CHECK: blr
+
+ %callee.knr.cast = bitcast i32 ()* %f to i32 (i8*)*
+ %call = tail call signext i32 %callee.knr.cast(i8* nest %p)
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-patchpoint.ll b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
index 67b26268a3a3..53b737ae9a0b 100644
--- a/test/CodeGen/PowerPC/ppc64-patchpoint.ll
+++ b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
@@ -15,22 +15,34 @@ entry:
; CHECK-NEXT: rldic 12, 12, 32, 16
; CHECK-NEXT: oris 12, 12, 48879
; CHECK-NEXT: ori 12, 12, 51966
+; CHECK-LE-NEXT: std 2, 24(1)
+; CHECK-BE-NEXT: std 2, 40(1)
+; CHECK-BE-NEXT: ld 2, 8(12)
+; CHECK-BE-NEXT: ld 12, 0(12)
; CHECK-NEXT: mtctr 12
; CHECK-NEXT: bctrl
+; CHECK-LE-NEXT: ld 2, 24(1)
+; CHECK-BE-NEXT: ld 2, 40(1)
; CHECK: li 12, -8531
; CHECK-NEXT: rldic 12, 12, 32, 16
; CHECK-NEXT: oris 12, 12, 48879
; CHECK-NEXT: ori 12, 12, 51967
+; CHECK-LE-NEXT: std 2, 24(1)
+; CHECK-BE-NEXT: std 2, 40(1)
+; CHECK-BE-NEXT: ld 2, 8(12)
+; CHECK-BE-NEXT: ld 12, 0(12)
; CHECK-NEXT: mtctr 12
; CHECK-NEXT: bctrl
+; CHECK-LE-NEXT: ld 2, 24(1)
+; CHECK-BE-NEXT: ld 2, 40(1)
; CHECK: blr
%resolveCall2 = inttoptr i64 244837814094590 to i8*
- %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 40, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 244837814094591 to i8*
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 24, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 40, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -65,13 +77,13 @@ entry:
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64, i64* %tmp81, align 8
tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 48, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64, i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64, i64* %tmp85, align 8
tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 48, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap.ll b/test/CodeGen/PowerPC/ppc64-stackmap.ll
index 917fa7422512..a77339f8e475 100644
--- a/test/CodeGen/PowerPC/ppc64-stackmap.ll
+++ b/test/CodeGen/PowerPC/ppc64-stackmap.ll
@@ -112,7 +112,7 @@ target triple = "powerpc64-unknown-linux-gnu"
define void @constantargs() {
entry:
%0 = inttoptr i64 244837814094590 to i8*
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 24, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 40, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
ret void
}
@@ -160,7 +160,7 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 24, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 40, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
@@ -176,7 +176,7 @@ ret:
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 40, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
@@ -196,7 +196,7 @@ entry:
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 244837814094590 to i8*
- call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 24, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 40, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -218,7 +218,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 40, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -240,7 +240,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 40, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -260,7 +260,7 @@ entry:
; CHECK-NEXT: .short 31
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
entry:
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 24, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 40, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
ret void
}
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index cd77548b281b..41dcb0f5b3fc 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx -recip=sqrtf:0,sqrtd:0 | FileCheck %s -check-prefix=CHECK-NONR
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck -check-prefix=CHECK-SAFE %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -24,6 +25,13 @@ define double @foo(double %a, double %b) nounwind {
; CHECK-NEXT: fmul
; CHECK: blr
+; CHECK-NONR: @foo
+; CHECK-NONR: frsqrte
+; CHECK-NONR-NOT: fmadd
+; CHECK-NONR: fmul
+; CHECK-NONR-NOT: fmadd
+; CHECK-NONR: blr
+
; CHECK-SAFE: @foo
; CHECK-SAFE: fsqrt
; CHECK-SAFE: fdiv
@@ -90,6 +98,13 @@ define float @goo(float %a, float %b) nounwind {
; CHECK-NEXT: fmuls
; CHECK-NEXT: blr
+; CHECK-NONR: @goo
+; CHECK-NONR: frsqrtes
+; CHECK-NONR-NOT: fmadds
+; CHECK-NONR: fmuls
+; CHECK-NONR-NOT: fmadds
+; CHECK-NONR: blr
+
; CHECK-SAFE: @goo
; CHECK-SAFE: fsqrts
; CHECK-SAFE: fdivs
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 62403e711968..dcbdd69d5d50 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -18,10 +18,10 @@ entry:
; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
; CHECK: ld 31, 0([[REG]])
; CHECK: ld [[REG2:[0-9]+]], 8([[REG]])
-; CHECK: ld 1, 16([[REG]])
-; CHECK: mtctr [[REG2]]
-; CHECK: ld 30, 32([[REG]])
-; CHECK: ld 2, 24([[REG]])
+; CHECK-DAG: ld 1, 16([[REG]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK-DAG: ld 30, 32([[REG]])
+; CHECK-DAG: ld 2, 24([[REG]])
; CHECK: bctr
return: ; No predecessors!
diff --git a/test/CodeGen/PowerPC/swaps-le-3.ll b/test/CodeGen/PowerPC/swaps-le-3.ll
index 0c1748df9fcd..49b93976d310 100644
--- a/test/CodeGen/PowerPC/swaps-le-3.ll
+++ b/test/CodeGen/PowerPC/swaps-le-3.ll
@@ -17,8 +17,8 @@ entry:
}
; CHECK-LABEL: @test
-; CHECK: xxspltd
-; CHECK: lxvd2x
+; CHECK-DAG: xxspltd
+; CHECK-DAG: lxvd2x
; CHECK: xvadddp
; CHECK: stxvd2x
; CHECK-NOT: xxswapd
diff --git a/test/CodeGen/PowerPC/swaps-le-5.ll b/test/CodeGen/PowerPC/swaps-le-5.ll
new file mode 100644
index 000000000000..5cd739a0efa9
--- /dev/null
+++ b/test/CodeGen/PowerPC/swaps-le-5.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s
+
+; These tests verify that VSX swap optimization works for various
+; manipulations of <2 x double> vectors.
+
+@x = global <2 x double> <double 9.970000e+01, double -1.032220e+02>, align 16
+@z = global <2 x double> <double 2.332000e+01, double 3.111111e+01>, align 16
+
+define void @bar0(double %y) {
+entry:
+ %0 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = insertelement <2 x double> %0, double %y, i32 0
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @bar0
+; CHECK-DAG: xxswapd {{[0-9]+}}, 1
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
+; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1
+; CHECK: stxvd2x [[REG3]]
+
+define void @bar1(double %y) {
+entry:
+ %0 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = insertelement <2 x double> %0, double %y, i32 1
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @bar1
+; CHECK-DAG: xxswapd {{[0-9]+}}, 1
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
+; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]]
+; CHECK: stxvd2x [[REG3]]
+
+define void @baz0() {
+entry:
+ %0 = load <2 x double>, <2 x double>* @z, align 16
+ %1 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @baz0
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK: xxmrghd
+; CHECK: stxvd2x
+; CHECK-NOT: xxswapd
+
+define void @baz1() {
+entry:
+ %0 = load <2 x double>, <2 x double>* @z, align 16
+ %1 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 3, i32 1>
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @baz1
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK: xxmrgld
+; CHECK: stxvd2x
+; CHECK-NOT: xxswapd
+
diff --git a/test/CodeGen/PowerPC/tls-store2.ll b/test/CodeGen/PowerPC/tls-store2.ll
index e9aa17e8c0ff..649508637f4e 100644
--- a/test/CodeGen/PowerPC/tls-store2.ll
+++ b/test/CodeGen/PowerPC/tls-store2.ll
@@ -29,6 +29,8 @@ entry:
; CHECK: addi 3, {{[0-9]+}}, __once_call@got@tlsgd@l
; CHECK: bl __tls_get_addr(__once_call@tlsgd)
; CHECK-NEXT: nop
-; CHECK: std {{[0-9]+}}, 0(3)
+; FIXME: We don't really need the copy here either, we could move the store up.
+; CHECK: mr [[REG1:[0-9]+]], 3
+; CHECK: std {{[0-9]+}}, 0([[REG1]])
declare void @__once_call_impl()
diff --git a/test/CodeGen/PowerPC/vsx-elementary-arith.ll b/test/CodeGen/PowerPC/vsx-elementary-arith.ll
index d8f76bb989e7..5416f667aef1 100644
--- a/test/CodeGen/PowerPC/vsx-elementary-arith.ll
+++ b/test/CodeGen/PowerPC/vsx-elementary-arith.ll
@@ -116,5 +116,36 @@ entry:
; CHECK: xssqrtdp {{[0-9]+}}
}
+; Vector forms
+; Function Attrs: nounwind
+define <4 x float> @emit_xvrsqrtesp() {
+entry:
+; CHECK-LABEL: @emit_xvrsqrtesp
+ %vf = alloca <4 x float>, align 16
+ %vfr = alloca <4 x float>, align 16
+ %0 = load <4 x float>, <4 x float>* %vf, align 16
+ %call = call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %0)
+; CHECK: xvrsqrtesp {{[0-9]+}}, {{[0-9]+}}
+ ret <4 x float> %call
+}
+
+; Function Attrs: nounwind
+define <2 x double> @emit_xvrsqrtedp() {
+entry:
+; CHECK-LABEL: @emit_xvrsqrtedp
+ %vd = alloca <2 x double>, align 16
+ %vdr = alloca <2 x double>, align 16
+ %0 = load <2 x double>, <2 x double>* %vd, align 16
+ %call = call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %0)
+ ret <2 x double> %call
+; CHECK: xvrsqrtedp {{[0-9]+}}, {{[0-9]+}}
+}
+
; Function Attrs: nounwind
declare double @sqrt(double)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double>)
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index d85927396e3e..4f556b6b79c2 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -49,12 +49,13 @@ entry:
; CHECK-LABEL: @test2
; CHECK-DAG: li [[C1:[0-9]+]], 8
; CHECK-DAG: li [[C2:[0-9]+]], 16
-; CHECK-DAG: xsmaddmdp 3, 2, 1
-; CHECK-DAG: xsmaddmdp 4, 2, 1
-; CHECK-DAG: xsmaddadp 1, 2, 5
-; CHECK-DAG: stxsdx 3, 0, 8
-; CHECK-DAG: stxsdx 4, 8, [[C1]]
-; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; FIXME: We no longer get this because of copy ordering at the MI level.
+; CHECX-DAG: xsmaddmdp 3, 2, 1
+; CHECX-DAG: xsmaddmdp 4, 2, 1
+; CHECX-DAG: xsmaddadp 1, 2, 5
+; CHECX-DAG: stxsdx 3, 0, 8
+; CHECX-DAG: stxsdx 4, 8, [[C1]]
+; CHECX-DAG: stxsdx 1, 8, [[C2]]
; CHECK: blr
; CHECK-FISL-LABEL: @test2
@@ -213,14 +214,15 @@ entry:
ret void
; CHECK-LABEL: @testv2
-; CHECK-DAG: xvmaddmdp 36, 35, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
-; CHECK-DAG: li [[C1:[0-9]+]], 16
-; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
-; CHECK-DAG: stxvd2x 36, 0, 3
-; CHECK-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
-; CHECK-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
+; FIXME: We currently don't get this because of copy ordering on the MI level.
+; CHECX-DAG: xvmaddmdp 36, 35, 34
+; CHECX-DAG: xvmaddmdp 37, 35, 34
+; CHECX-DAG: li [[C1:[0-9]+]], 16
+; CHECX-DAG: li [[C2:[0-9]+]], 32
+; CHECX-DAG: xvmaddadp 34, 35, 38
+; CHECX-DAG: stxvd2x 36, 0, 3
+; CHECX-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
+; CHECX-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
; CHECK: blr
; CHECK-FISL-LABEL: @testv2
diff --git a/test/CodeGen/PowerPC/vsx-fma-sp.ll b/test/CodeGen/PowerPC/vsx-fma-sp.ll
index 1c3e457f92cb..b4dd2e1627c4 100644
--- a/test/CodeGen/PowerPC/vsx-fma-sp.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-sp.ll
@@ -42,12 +42,13 @@ entry:
; CHECK-LABEL: @test2sp
; CHECK-DAG: li [[C1:[0-9]+]], 4
; CHECK-DAG: li [[C2:[0-9]+]], 8
-; CHECK-DAG: xsmaddmsp 3, 2, 1
-; CHECK-DAG: xsmaddmsp 4, 2, 1
-; CHECK-DAG: xsmaddasp 1, 2, 5
-; CHECK-DAG: stxsspx 3, 0, 8
-; CHECK-DAG: stxsspx 4, 8, [[C1]]
-; CHECK-DAG: stxsspx 1, 8, [[C2]]
+; FIXME: We now miss this because of copy ordering at the MI level.
+; CHECX-DAG: xsmaddmsp 3, 2, 1
+; CHECX-DAG: xsmaddmsp 4, 2, 1
+; CHECX-DAG: xsmaddasp 1, 2, 5
+; CHECX-DAG: stxsspx 3, 0, 8
+; CHECX-DAG: stxsspx 4, 8, [[C1]]
+; CHECX-DAG: stxsspx 1, 8, [[C2]]
; CHECK: blr
; CHECK-FISL-LABEL: @test2sp
diff --git a/test/CodeGen/SPARC/basictest.ll b/test/CodeGen/SPARC/basictest.ll
index 7b540074a35f..3792100b2e63 100644
--- a/test/CodeGen/SPARC/basictest.ll
+++ b/test/CodeGen/SPARC/basictest.ll
@@ -38,7 +38,7 @@ entry:
; CHECK-LABEL: signed_divide:
; CHECK: sra %o0, 31, %o2
-; CHECK: wr %o2, %g0, %y
+; CHECK: wr %g0, %o2, %y
; CHECK: sdiv %o0, %o1, %o0
define i32 @signed_divide(i32 %a, i32 %b) {
%r = sdiv i32 %a, %b
diff --git a/test/CodeGen/SPARC/multiple-div.ll b/test/CodeGen/SPARC/multiple-div.ll
new file mode 100644
index 000000000000..6934f69ac18c
--- /dev/null
+++ b/test/CodeGen/SPARC/multiple-div.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+; RUN: llc -O0 < %s -march=sparc | FileCheck %s
+
+;; llc -O0 used to try to spill Y to the stack, which isn't possible,
+;; and then crashed. Additionally, in -O1, it would omit the second
+;; apparently-redundant wr to %y, which is not actually redundant
+;; because the spec says to treat %y as potentially-written by udiv.
+
+; CHECK-LABEL: two_divides:
+; CHECK: wr %g0, %g0, %y
+; CHECK: udiv
+; CHECK: wr %g0, %g0, %y
+; CHECK: udiv
+; CHECK: add
+
+define i32 @two_divides(i32 %a, i32 %b) {
+ %r = udiv i32 %a, %b
+ %r2 = udiv i32 %b, %a
+ %r3 = add i32 %r, %r2
+ ret i32 %r3
+}
diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll
index 21af8c119b04..299562fe4c5c 100644
--- a/test/CodeGen/Thumb2/aapcs.ll
+++ b/test/CodeGen/Thumb2/aapcs.ll
@@ -33,8 +33,7 @@ define float @float_on_stack(double %a, double %b, double %c, double %d, double
define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
; CHECK-LABEL: double_on_stack:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
; HARD: vldr d0, [sp]
; CHECK-NEXT: bx lr
ret double %i
@@ -42,8 +41,7 @@ define double @double_on_stack(double %a, double %b, double %c, double %d, doubl
define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
; CHECK-LABEL: double_not_split:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
; HARD: vldr d0, [sp]
; CHECK-NEXT: bx lr
ret double %i
diff --git a/test/CodeGen/WebAssembly/lit.local.cfg b/test/CodeGen/WebAssembly/lit.local.cfg
new file mode 100644
index 000000000000..743473517cd0
--- /dev/null
+++ b/test/CodeGen/WebAssembly/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'WebAssembly' in config.root.targets:
+ config.unsupported = True
diff --git a/test/CodeGen/WinEH/cppeh-alloca-sink.ll b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
index cc6cec9e4d69..f215dca2ddd3 100644
--- a/test/CodeGen/WinEH/cppeh-alloca-sink.ll
+++ b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
@@ -81,7 +81,7 @@ eh.resume: ; preds = %lpad
}
; CHECK-LABEL: define void @sink_alloca_to_catch()
-; CHECK: call void (...) @llvm.frameescape(i32* %only_used_in_catch)
+; CHECK: call void (...) @llvm.localescape(i32* %only_used_in_catch)
declare void @use_catch_var(i32*) #1
@@ -162,14 +162,14 @@ eh.resume: ; preds = %lpad1, %catch.dispa
}
; CHECK-LABEL: define void @dont_sink_alloca_to_catch(i32 %n)
-; CHECK: call void (...) @llvm.frameescape(i32* %live_in_out_catch)
+; CHECK: call void (...) @llvm.localescape(i32* %live_in_out_catch)
; CHECK-LABEL: define internal i8* @sink_alloca_to_catch.catch(i8*, i8*)
-; CHECK: %only_used_in_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %only_used_in_catch.i8 = call i8* @llvm.localrecover({{.*}}, i32 0)
; CHECK: %only_used_in_catch = bitcast
; CHECK-LABEL: define internal i8* @dont_sink_alloca_to_catch.catch(i8*, i8*)
-; CHECK: %live_in_out_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %live_in_out_catch.i8 = call i8* @llvm.localrecover({{.*}}, i32 0)
; CHECK: %live_in_out_catch = bitcast
diff --git a/test/CodeGen/WinEH/cppeh-catch-all-win32.ll b/test/CodeGen/WinEH/cppeh-catch-all-win32.ll
new file mode 100644
index 000000000000..b2e84b90d69f
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-catch-all-win32.ll
@@ -0,0 +1,86 @@
+; RUN: opt -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; extern "C" void may_throw();
+; extern "C" void handle_exception();
+; extern "C" void test() {
+; try {
+; may_throw();
+; } catch (...) {
+; handle_exception();
+; }
+; }
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; The function entry in this case remains unchanged.
+; CHECK: define void @test()
+; CHECK: entry:
+; CHECK: invoke void @may_throw()
+; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
+
+define void @test() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ invoke void @may_throw()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ br label %try.cont
+
+; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: catch i8* null
+; CHECK-NEXT: [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* null, i32 -1, i8* ()* @test.catch)
+; CHECK-NEXT: indirectbr i8* [[RECOVER]], [label %try.cont]
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ store i8* %1, i8** %exn.slot
+ %2 = extractvalue { i8*, i32 } %0, 1
+ store i32 %2, i32* %ehselector.slot
+ br label %catch
+
+; CHECK-NOT: catch:
+; CHECK-NOT: @handle_exception()
+
+catch: ; preds = %lpad
+ %exn = load i8*, i8** %exn.slot
+ call void @llvm.eh.begincatch(i8* %exn, i8* null) #1
+ call void @handle_exception()
+ call void @llvm.eh.endcatch() #1
+ br label %try.cont
+
+try.cont: ; preds = %catch, %invoke.cont
+ ret void
+
+; CHECK: }
+}
+
+; CHECK: define internal i8* @test.catch()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (void ()* @test to i8*), i8* %{{.*}})
+; CHECK: call void @handle_exception()
+; CHECK: ret i8* blockaddress(@test, %try.cont)
+; CHECK: }
+
+
+declare void @may_throw() #0
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #1
+
+declare void @handle_exception() #0
+
+; Function Attrs: nounwind
+declare void @llvm.eh.endcatch() #1
+
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/WinEH/cppeh-catch-and-throw.ll b/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
index 240ca987690d..d604b86deb35 100644
--- a/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
@@ -45,7 +45,7 @@ $_TI1H = comdat any
; This is just a minimal check to verify that main was handled by WinEHPrepare.
; CHECK: define void @"\01?test@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; CHECK: invoke void @_CxxThrowException
; CHECK: }
@@ -105,7 +105,7 @@ unreachable: ; preds = %catch, %entry
;
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: call i8* @llvm.framerecover
+; CHECK: call i8* @llvm.localrecover
; CHECK: call void @"\01??1Obj@@QEAA@XZ"
; CHECK: invoke void @llvm.donothing()
; CHECK: to label %[[SPLIT_LABEL:.+]] unwind label %[[LPAD_LABEL:.+]]
diff --git a/test/CodeGen/WinEH/cppeh-catch-scalar.ll b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
index 172502cf73c8..3b5ab746d63c 100644
--- a/test/CodeGen/WinEH/cppeh-catch-scalar.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: define void @_Z4testv()
; CHECK: entry:
; CHECK: [[I_PTR:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[I_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[I_PTR]])
; CHECK: invoke void @_Z9may_throwv()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -96,7 +96,7 @@ eh.resume: ; preds = %catch.dispatch
; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[I_PTR1]], align 4
; CHECK: call void @_Z10handle_inti(i32 [[TMP]])
diff --git a/test/CodeGen/WinEH/cppeh-catch-unwind.ll b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
index 6fd70d84b2af..8fdda9bbc02a 100644
--- a/test/CodeGen/WinEH/cppeh-catch-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
@@ -36,7 +36,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass
; CHECK: [[TMP0:\%.+]] = alloca i32, align 4
; CHECK: [[TMP1:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
+; CHECK: call void (...) @llvm.localescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
; CHECK: %call = invoke %class.SomeClass* @"\01??0SomeClass@@QEAA@XZ"(%class.SomeClass* %obj)
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -177,7 +177,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_TMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_TMP1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[TMP1_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP1]] to i32*
; CHECK: call void @"\01?handle_exception@@YAXXZ"()
; CHECK: ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont15)
@@ -185,7 +185,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[OBJ_PTR:\%.+]] = bitcast i8* %obj.i8 to %class.SomeClass*
; CHECK: call void @"\01??1SomeClass@@QEAA@XZ"(%class.SomeClass* [[OBJ_PTR]])
; CHECK: ret void
@@ -193,7 +193,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_TMP0:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_TMP0:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[TMP0_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP0]] to i32*
; CHECK: invoke void @"\01?handle_exception@@YAXXZ"()
; CHECK: to label %invoke.cont6 unwind label %[[LPAD5_LABEL:lpad[0-9]+]]
diff --git a/test/CodeGen/WinEH/cppeh-frame-vars.ll b/test/CodeGen/WinEH/cppeh-frame-vars.ll
index 1077ad0b8765..c2dbd8ecab60 100644
--- a/test/CodeGen/WinEH/cppeh-frame-vars.ll
+++ b/test/CodeGen/WinEH/cppeh-frame-vars.ll
@@ -58,7 +58,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[TMP:\%.+]] = bitcast %struct.SomeData* [[DATA_PTR]] to i8*
; CHECK: call void @llvm.memset(i8* [[TMP]], i8 0, i64 8, i32 4, i1 false)
; CHECK: store i32 0, i32* [[I_PTR]], align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
; CHECK: br label %for.cond
; Function Attrs: uwtable
@@ -198,15 +198,15 @@ eh.resume: ; preds = %catch.dispatch
; The following catch handler should be outlined.
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR1:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[NUMEXCEPTIONS_PTR1:\%.+]] = bitcast i8* [[RECOVER_NUMEXCEPTIONS]] to i32*
-; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[EXCEPTIONVAL_PTR1:\%.+]] = bitcast i8* [[RECOVER_EXCEPTIONVAL]] to [10 x i32]*
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; CHECK: [[RECOVER_DATA:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
+; CHECK: [[RECOVER_DATA:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
; CHECK: [[DATA_PTR1:\%.+]] = bitcast i8* [[RECOVER_DATA]] to %struct.SomeData*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[E_PTR1]], align 4
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[NUMEXCEPTIONS_PTR]], align 4
diff --git a/test/CodeGen/WinEH/cppeh-inalloca.ll b/test/CodeGen/WinEH/cppeh-inalloca.ll
index 3dc1348efffa..649c5e72e2dd 100644
--- a/test/CodeGen/WinEH/cppeh-inalloca.ll
+++ b/test/CodeGen/WinEH/cppeh-inalloca.ll
@@ -41,7 +41,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[RETVAL:\%.+]] = alloca i32, align 4
; CHECK: [[E_PTR:\%.+]] = alloca i32, align 4
; CHECK: [[CLEANUP_SLOT:\%.+]] = alloca i32
-; CHECK: call void (...) @llvm.frameescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
+; CHECK: call void (...) @llvm.localescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -139,13 +139,13 @@ eh.resume: ; preds = %ehcleanup
; The following catch handler should be outlined.
; CHECK: define internal i8* @"\01?test@@YAHUA@@@Z.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_EH_TEMP:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_EH_TEMP:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
; CHECK: [[EH_TEMP:\%.+]] = bitcast i8* [[RECOVER_EH_TEMP]] to <{ %struct.A }>**
-; CHECK: [[RECOVER_RETVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_RETVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 2)
; CHECK: [[RETVAL1:\%.+]] = bitcast i8* [[RECOVER_RETVAL]] to i32*
-; CHECK: [[RECOVER_CLEANUPSLOT:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_CLEANUPSLOT:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 3)
; CHECK: [[CLEANUPSLOT1:\%.+]] = bitcast i8* [[RECOVER_CLEANUPSLOT]] to i32*
; CHECK: [[E_I8PTR:\%.+]] = bitcast i32* [[E_PTR]] to i8*
; CHECK: [[TMP_RELOAD:\%.+]] = load <{ %struct.A }>*, <{ %struct.A }>** [[EH_TEMP]]
@@ -162,7 +162,7 @@ eh.resume: ; preds = %ehcleanup
; The following cleanup handler should be outlined.
; CHECK: define internal void @"\01?test@@YAHUA@@@Z.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_EH_TEMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_EH_TEMP1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
; CHECK: [[EH_TEMP1:\%.+]] = bitcast i8* [[RECOVER_EH_TEMP]] to <{ %struct.A }>**
; CHECK: [[TMP_RELOAD1:\%.+]] = load <{ %struct.A }>*, <{ %struct.A }>** [[EH_TEMP1]]
; CHECK: [[A3:\%.+]] = getelementptr inbounds <{ %struct.A }>, <{ %struct.A }>* [[TMP_RELOAD1]], i32 0, i32 0
diff --git a/test/CodeGen/WinEH/cppeh-min-unwind.ll b/test/CodeGen/WinEH/cppeh-min-unwind.ll
index b1f157ade29b..98d6d6fcacb6 100644
--- a/test/CodeGen/WinEH/cppeh-min-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-min-unwind.ll
@@ -25,7 +25,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: entry:
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass, align 4
; CHECK: call void @_ZN9SomeClassC1Ev(%class.SomeClass* [[OBJ_PTR]])
-; CHECK: call void (...) @llvm.frameescape(%class.SomeClass* [[OBJ_PTR]])
+; CHECK: call void (...) @llvm.localescape(%class.SomeClass* [[OBJ_PTR]])
; CHECK: invoke void @_Z9may_throwv()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -74,7 +74,7 @@ eh.resume: ; preds = %lpad
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[OBJ_PTR1:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass*
; CHECK: call void @_ZN9SomeClassD1Ev(%class.SomeClass* [[OBJ_PTR1]])
; CHECK: ret void
diff --git a/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll b/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
index 1294d0b8ff30..c69633f17e28 100644
--- a/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
+++ b/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
@@ -31,7 +31,7 @@ target triple = "x86_64-pc-windows-msvc"
;
; CHECK-LABEL: define void @"\01?test@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; CHECK: }
; Function Attrs: nounwind uwtable
@@ -67,7 +67,7 @@ try.cont: ; preds = %catch, %invoke.cont
; Verify that a cleanup handler was created and that it calls ~Obj().
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: call void @"\01??1Obj@@QEAA@XZ"
; CHECK: ret void
; CHECK: }
diff --git a/test/CodeGen/WinEH/cppeh-multi-catch.ll b/test/CodeGen/WinEH/cppeh-multi-catch.ll
index 25224551cadc..266cdea20cdb 100644
--- a/test/CodeGen/WinEH/cppeh-multi-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-multi-catch.ll
@@ -50,7 +50,7 @@ $"\01??_R0?AVSomeClass@@@8" = comdat any
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass*, align 8
; CHECK: [[LL_PTR:\%.+]] = alloca i64, align 8
; CHECK: [[I_PTR:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[I_PTR]], i64* [[LL_PTR]], %class.SomeClass** [[OBJ_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[I_PTR]], i64* [[LL_PTR]], %class.SomeClass** [[OBJ_PTR]])
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -161,7 +161,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: call void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
@@ -170,7 +170,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_LL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_LL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[LL_PTR:\%.+]] = bitcast i8* [[RECOVER_LL]] to i64*
; CHECK: [[TMP2:\%.+]] = load i64, i64* [[LL_PTR]], align 8
; CHECK: call void @"\01?handle_long_long@@YAX_J@Z"(i64 [[TMP2]])
@@ -179,7 +179,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.2"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[OBJ_PTR:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass**
; CHECK: [[TMP3:\%.+]] = load %class.SomeClass*, %class.SomeClass** [[OBJ_PTR]], align 8
; CHECK: call void @"\01?handle_obj@@YAXPEAVSomeClass@@@Z"(%class.SomeClass* [[TMP3]])
diff --git a/test/CodeGen/WinEH/cppeh-nested-1.ll b/test/CodeGen/WinEH/cppeh-nested-1.ll
index a5e80ac2b2ab..d525d8a1a67e 100644
--- a/test/CodeGen/WinEH/cppeh-nested-1.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-1.ll
@@ -34,7 +34,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: entry:
; CHECK: %i = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(float* %f, i32* %i)
+; CHECK: call void (...) @llvm.localescape(float* %f, i32* %i)
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -136,7 +136,7 @@ eh.resume: ; %catch.dispatch3
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_F1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[F_PTR1:\%.+]] = bitcast i8* [[RECOVER_F1]] to float*
; CHECK: [[TMP2:\%.+]] = load float, float* [[F_PTR1]], align 4
; CHECK: call void @"\01?handle_float@@YAXM@Z"(float [[TMP2]])
@@ -145,7 +145,7 @@ eh.resume: ; %catch.dispatch3
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: invoke void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
diff --git a/test/CodeGen/WinEH/cppeh-nested-2.ll b/test/CodeGen/WinEH/cppeh-nested-2.ll
index 385958b006d2..2764e7478c71 100644
--- a/test/CodeGen/WinEH/cppeh-nested-2.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-2.ll
@@ -44,7 +44,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: %inner = alloca %class.Inner, align 1
; CHECK: %i = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
+; CHECK: call void (...) @llvm.localescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
; CHECK: invoke void @_ZN5OuterC1Ev(%class.Outer* %outer)
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -243,7 +243,7 @@ eh.resume: ; preds = %catch.dispatch11
; This catch handler should be outlined.
; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
; CHECK: [[TMP:\%.+]] = load float, float* [[F_PTR]], align 4
; CHECK: call void @_Z12handle_floatf(float [[TMP]])
@@ -253,7 +253,7 @@ eh.resume: ; preds = %catch.dispatch11
; This catch handler should be outlined.
; CHECK: define internal i8* @_Z4testv.catch.1(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: invoke void @_Z10handle_inti(i32 [[TMP1]])
@@ -270,7 +270,7 @@ eh.resume: ; preds = %catch.dispatch11
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OUTER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_OUTER:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 2)
; CHECK: [[OUTER_PTR:\%.+]] = bitcast i8* [[RECOVER_OUTER]] to %class.Outer*
; CHECK: call void @_ZN5OuterD1Ev(%class.Outer* [[OUTER_PTR]])
; CHECK: ret void
@@ -279,7 +279,7 @@ eh.resume: ; preds = %catch.dispatch11
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup.2(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_INNER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_INNER:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 3)
; CHECK: [[INNER_PTR:\%.+]] = bitcast i8* [[RECOVER_INNER]] to %class.Inner*
; CHECK: call void @_ZN5InnerD1Ev(%class.Inner* [[INNER_PTR]])
; CHECK: ret void
diff --git a/test/CodeGen/WinEH/cppeh-nested-3.ll b/test/CodeGen/WinEH/cppeh-nested-3.ll
index 33faaf0f591a..88759f406fb1 100644
--- a/test/CodeGen/WinEH/cppeh-nested-3.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-3.ll
@@ -41,7 +41,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: %i = alloca i32, align 4
; CHECK: %j = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* %j, i32* %i, float* %f)
+; CHECK: call void (...) @llvm.localescape(i32* %j, i32* %i, float* %f)
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -181,9 +181,9 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_J:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_J:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[J_PTR:\%.+]] = bitcast i8* [[RECOVER_J]] to i32*
-; CHECK: [[RECOVER_I1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I1]] to i32*
; CHECK: [[TMP3:\%.+]] = load i32, i32* [[J_PTR]], align 4
; CHECK: store i32 [[TMP3]], i32* [[I_PTR1]]
@@ -192,7 +192,7 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
; CHECK: [[TMP2:\%.+]] = load float, float* [[F_PTR]], align 4
; CHECK: call void @"\01?handle_float@@YAXM@Z"(float [[TMP2]])
@@ -201,7 +201,7 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.2"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont2 unwind label %[[LPAD1_LABEL:lpad[0-9]*]]
diff --git a/test/CodeGen/WinEH/cppeh-nested-rethrow.ll b/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
index 14a5f233f9ba..53f532c8eb16 100644
--- a/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
@@ -53,7 +53,7 @@ $_TI1H = comdat any
; CHECK-LABEL: define void @"\01?test1@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; Function Attrs: nounwind uwtable
define void @"\01?test1@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
@@ -121,7 +121,7 @@ declare void @llvm.eh.endcatch() #1
; CHECK-LABEL: define void @"\01?test2@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; Function Attrs: nounwind uwtable
define void @"\01?test2@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
diff --git a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
index 83236c4188ff..7b474c9d38a3 100644
--- a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
+++ b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
@@ -68,7 +68,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: store i32* [[A_PTR]], i32** [[A_REGMEM]]
; CHECK: [[B_PTR:\%.+]] = getelementptr inbounds %struct.SomeData, %struct.SomeData* [[TMPCAST]], i64 0, i32 1
; CHECK: store i32* [[B_PTR]], i32** [[B_REGMEM]]
-; CHECK: call void (...) @llvm.frameescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* %inc.reg2mem, i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
+; CHECK: call void (...) @llvm.localescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* %inc.reg2mem, i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
; CHECK: br label %for.body
; Function Attrs: uwtable
@@ -192,19 +192,19 @@ eh.resume: ; preds = %lpad
; The following catch handler should be outlined.
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[NUMEXCEPTIONS_REGMEM:\%.+]] = bitcast i8* [[RECOVER_NUMEXCEPTIONS]] to i32*
-; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[EXCEPTIONVAL:\%.+]] = bitcast i8* [[RECOVER_EXCEPTIONVAL]] to [10 x i32]*
-; CHECK: [[RECOVER_INC:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_INC:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
; CHECK: [[INC_REGMEM:\%.+]] = bitcast i8* [[RECOVER_INC]] to i32*
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
; CHECK: [[I_REGMEM:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; CHECK: [[RECOVER_A:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 5)
+; CHECK: [[RECOVER_A:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 5)
; CHECK: [[A_REGMEM:\%.+]] = bitcast i8* [[RECOVER_A]] to i32**
-; CHECK: [[RECOVER_B:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 6)
+; CHECK: [[RECOVER_B:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 6)
; CHECK: [[B_REGMEM:\%.+]] = bitcast i8* [[RECOVER_B]] to i32**
; CHECK: [[E_I8PTR:\%.+]] = bitcast i32* [[E_PTR]] to i8*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[E_PTR]], align 4
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
index fc632af17405..2d31a1d5cf4f 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
@@ -49,7 +49,7 @@ entry:
%e = alloca i32, align 4
%0 = bitcast i32* %tmp.i to i8*
store i32 42, i32* %tmp.i, align 4, !tbaa !2
- call void (...) @llvm.frameescape(i32* %e)
+ call void (...) @llvm.localescape(i32* %e)
invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #6
to label %.noexc unwind label %lpad1
@@ -92,7 +92,7 @@ declare i8* @llvm.eh.actions(...) #3
define internal i8* @main.catch(i8*, i8*) #5 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %e.i8 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %1, i32 0)
+ %e.i8 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %1, i32 0)
%e = bitcast i8* %e.i8 to i32*
%2 = bitcast i32* %e to i8*
%3 = load i32, i32* %e, align 4, !tbaa !2
@@ -114,6 +114,7 @@ stub: ; preds = %entry
; CHECK: .seh_handlerdata
; CHECK: .long ($cppxdata$main)@IMGREL
+; CHECK: .align 4
; CHECK-NEXT: $cppxdata$main:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
@@ -139,10 +140,10 @@ stub: ; preds = %entry
declare void @llvm.donothing() #2
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
attributes #0 = { noreturn uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="main" }
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch.ll b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
index 02cc682cbe4b..a5d86dceea93 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s | FileCheck %s
-
-target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc"
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; This test case is equivalent to:
; void f() {
@@ -32,7 +30,7 @@ $"\01??_R0H@8" = comdat any
define internal i8* @"\01?f@@YAXXZ.catch"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 0)
+ %.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 0)
%bc2 = bitcast i8* %.i8 to i32**
%bc3 = bitcast i32** %bc2 to i8*
invoke void @"\01?may_throw@@YAXXZ"()
@@ -51,14 +49,14 @@ lpad1: ; preds = %entry
; CHECK-LABEL: "?f@@YAXXZ.catch":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: .seh_handlerdata
-; CHECK: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; CHECK-NOT: jmp{{[ql]}} *
+; X64: .seh_handlerdata
+; X64-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
define internal i8* @"\01?f@@YAXXZ.catch1"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 1)
+ %.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 1)
%2 = bitcast i8* %.i8 to double*
%3 = bitcast double* %2 to i8*
invoke void () @llvm.donothing()
@@ -76,11 +74,11 @@ lpad: ; preds = %entry
; CHECK-LABEL: "?f@@YAXXZ.catch1":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: ".L?f@@YAXXZ.catch1$parent_frame_offset" = 16
-; CHECK: movq %rdx, 16(%rsp)
-; CHECK: .seh_handlerdata
-; CHECK: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; CHECK-NOT: jmp{{[ql]}} *
+; X64: ".L?f@@YAXXZ.catch1$parent_frame_offset" = 16
+; X64: movq %rdx, 16(%rsp)
+; X64: .seh_handlerdata
+; X64: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
define void @"\01?f@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
@@ -88,7 +86,7 @@ entry:
%ehselector.slot = alloca i32
%0 = alloca i32*, align 8
%1 = alloca double, align 8
- call void (...) @llvm.frameescape(i32** %0, double* %1)
+ call void (...) @llvm.localescape(i32** %0, double* %1)
invoke void @"\01?may_throw@@YAXXZ"()
to label %invoke.cont unwind label %lpad2
@@ -118,20 +116,38 @@ try.cont8: ; preds = %lpad2, %try.cont
; CHECK-LABEL: "?f@@YAXXZ":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: .seh_handlerdata
-; CHECK-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT:"$cppxdata$?f@@YAXXZ":
-; CHECK-NEXT: .long 429065506
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long ("$tryMap$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 6
-; CHECK-NEXT: .long ("$ip2state$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 32
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 1
+; CHECK-NOT: jmp{{[ql]}} *
+
+; X64: .seh_handlerdata
+; X64-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; X86: .section .xdata,"dr"
+
+; CHECK: .align 4
+
+; X64: "$cppxdata$?f@@YAXXZ":
+; X64-NEXT: .long 429065506
+; X64-NEXT: .long 4
+; X64-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 2
+; X64-NEXT: .long ("$tryMap$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 6
+; X64-NEXT: .long ("$ip2state$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 32
+; X64-NEXT: .long 0
+; X64-NEXT: .long 1
+
+; X86: "L__ehtable$?f@@YAXXZ":
+; X86-NEXT: .long 429065506
+; X86-NEXT: .long 4
+; X86-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")
+; X86-NEXT: .long 2
+; X86-NEXT: .long ("$tryMap$?f@@YAXXZ")
+; X86-NEXT: .long 0
+; X86-NEXT: .long 0
+; X86-NEXT: .long 0
+; X86-NEXT: .long 1
+
+
; CHECK-NEXT:"$stateUnwindMap$?f@@YAXXZ":
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long 0
@@ -146,37 +162,43 @@ try.cont8: ; preds = %lpad2, %try.cont
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long 2
; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long ("$handlerMap$0$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT: .long ("$handlerMap$0$?f@@YAXXZ")
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 2
; CHECK-NEXT: .long 3
; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long ("$handlerMap$1$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT: .long ("$handlerMap$1$?f@@YAXXZ")
; CHECK-NEXT:"$handlerMap$0$?f@@YAXXZ":
; CHECK-NEXT: .long 8
-; CHECK-NEXT: .long "??_R0H@8"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ$frame_escape_0"
-; CHECK-NEXT: .long "?f@@YAXXZ.catch"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ.catch$parent_frame_offset"
+; CHECK-NEXT: .long "??_R0H@8"
+; CHECK-NEXT: .long "{{.?}}L?f@@YAXXZ$frame_escape_0"
+; CHECK-NEXT: .long "?f@@YAXXZ.catch"
+; X64-NEXT: .long ".L?f@@YAXXZ.catch$parent_frame_offset"
; CHECK-NEXT:"$handlerMap$1$?f@@YAXXZ":
; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long "??_R0N@8"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ$frame_escape_1"
-; CHECK-NEXT: .long "?f@@YAXXZ.catch1"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ.catch1$parent_frame_offset"
-; CHECK-NEXT:"$ip2state$?f@@YAXXZ":
-; CHECK-NEXT: .long .Lfunc_begin0@IMGREL
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long .Lfunc_begin1@IMGREL
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long .Lfunc_begin2@IMGREL
-; CHECK-NEXT: .long -1
-; CHECK-NEXT: .long .Ltmp13@IMGREL
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long .Ltmp16@IMGREL
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long "??_R0N@8"
+; CHECK-NEXT: .long "{{.?}}L?f@@YAXXZ$frame_escape_1"
+; CHECK-NEXT: .long "?f@@YAXXZ.catch1"
+; X64-NEXT: .long ".L?f@@YAXXZ.catch1$parent_frame_offset"
+
+; X64-NEXT:"$ip2state$?f@@YAXXZ":
+; X64-NEXT: .long .Lfunc_begin0
+; X64-NEXT: .long 2
+; X64-NEXT: .long .Ltmp0
+; X64-NEXT: .long 0
+; X64-NEXT: .long .Lfunc_begin1
+; X64-NEXT: .long 3
+; X64-NEXT: .long .Lfunc_begin2
+; X64-NEXT: .long -1
+; X64-NEXT: .long .Ltmp13
+; X64-NEXT: .long 1
+; X64-NEXT: .long .Ltmp16
+; X64-NEXT: .long 0
+
+
+; X86: "___ehhandler$?f@@YAXXZ": # @"__ehhandler$?f@@YAXXZ"
+; X86: movl $"L__ehtable$?f@@YAXXZ", %eax
+; X86: jmp ___CxxFrameHandler3 # TAILCALL
declare void @"\01?may_throw@@YAXXZ"() #1
@@ -196,10 +218,10 @@ declare void @llvm.eh.endcatch() #3
declare i8* @llvm.eh.actions(...) #3
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
declare void @llvm.donothing()
diff --git a/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
index 14973023356a..b5cfd65030ab 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
@@ -30,6 +30,7 @@ $_TI1H = comdat any
; CHECK-LABEL: "?test1@@YAXXZ":
; CHECK: .seh_handlerdata
; CHECK-NEXT: .long ("$cppxdata$?test1@@YAXXZ")@IMGREL
+; CHECK-NEXT: .align 4
; CHECK-NEXT:"$cppxdata$?test1@@YAXXZ":
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 1
@@ -58,7 +59,7 @@ entry:
%ehselector.slot = alloca i32
store i32 0, i32* %tmp
%0 = bitcast i32* %tmp to i8*
- call void (...) @llvm.frameescape()
+ call void (...) @llvm.localescape()
store volatile i64 -2, i64* %unwindhelp
%1 = bitcast i64* %unwindhelp to i8*
call void @llvm.eh.unwindhelp(i8* %1)
@@ -92,6 +93,7 @@ entry:
; CHECK-LABEL: "?test2@@YAX_N@Z":
; CHECK: .seh_handlerdata
; CHECK-NEXT: .long ("$cppxdata$?test2@@YAX_N@Z")@IMGREL
+; CHECK-NEXT: .align 4
; CHECK-NEXT:"$cppxdata$?test2@@YAX_N@Z":
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
@@ -126,7 +128,7 @@ define void @"\01?test2@@YAX_N@Z"(i1 zeroext %b) #2 personality i8* bitcast (i32
%s1 = alloca %struct.S, align 1
%frombool = zext i1 %b to i8
store i8 %frombool, i8* %b.addr, align 1
- call void (...) @llvm.frameescape(%struct.S* %s, %struct.S* %s1)
+ call void (...) @llvm.localescape(%struct.S* %s, %struct.S* %s1)
call void @"\01?may_throw@@YAXXZ"()
invoke void @"\01?may_throw@@YAXXZ"()
to label %invoke.cont unwind label %lpad1
@@ -188,17 +190,17 @@ entry:
}
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #4
+declare void @llvm.localescape(...) #4
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #6
+declare i8* @llvm.localrecover(i8*, i8*, i32) #6
; Function Attrs: nounwind
declare void @llvm.eh.unwindhelp(i8*) #4
define internal void @"\01?test2@@YAX_N@Z.cleanup"(i8*, i8*) #7 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %s.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 0)
+ %s.i8 = call i8* @llvm.localrecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 0)
%s = bitcast i8* %s.i8 to %struct.S*
call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s) #4
invoke void @llvm.donothing()
@@ -215,7 +217,7 @@ stub: ; preds = %entry
define internal void @"\01?test2@@YAX_N@Z.cleanup1"(i8*, i8*) #7 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %s1.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 1)
+ %s1.i8 = call i8* @llvm.localrecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 1)
%s1 = bitcast i8* %s1.i8 to %struct.S*
call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s1) #4
invoke void @llvm.donothing()
diff --git a/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll b/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
index 678ea6f8ba13..87ccc9d9dedd 100644
--- a/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
@@ -30,7 +30,7 @@ $"\01??_R0H@8" = comdat any
; CHECK-LABEL: define void @"\01?f@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape()
+; CHECK: call void (...) @llvm.localescape()
; CHECK: invoke void @"\01?g@@YAXXZ"()
; Function Attrs: nounwind
diff --git a/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll b/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
index 5b974508bc11..092135368158 100644
--- a/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
+++ b/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
@@ -86,7 +86,7 @@ $"\01??_C@_03PMGGPEJJ@?$CFd?6?$AA@" = comdat any
; This is just a minimal check to verify that main was handled by WinEHPrepare.
; CHECK: define i32 @main()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape(i32* [[X_PTR:\%.+]], i32* [[X2_PTR:\%.+]], i8* [[C2_PTR:\%.+]], i8* [[C3_PTR:\%.+]], i8* [[C_PTR:\%.+]])
+; CHECK: call void (...) @llvm.localescape(i32* [[X_PTR:\%.+]], i32* [[X2_PTR:\%.+]], i8* [[C2_PTR:\%.+]], i8* [[C3_PTR:\%.+]], i8* [[C_PTR:\%.+]])
; CHECK: invoke void @_CxxThrowException
; CHECK: }
diff --git a/test/CodeGen/WinEH/cppeh-state-calc-1.ll b/test/CodeGen/WinEH/cppeh-state-calc-1.ll
index 1e71f8f38271..abc5d5292cf7 100644
--- a/test/CodeGen/WinEH/cppeh-state-calc-1.ll
+++ b/test/CodeGen/WinEH/cppeh-state-calc-1.ll
@@ -79,7 +79,7 @@ entry:
call void @"\01?two@@YAXXZ"() #3
store i32 2, i32* %tmp
%0 = bitcast i32* %tmp to i8*
- call void (...) @llvm.frameescape(i32* %x, i8* %c, i32* %x21)
+ call void (...) @llvm.localescape(i32* %x, i8* %c, i32* %x21)
invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #5
to label %unreachable unwind label %lpad
@@ -166,7 +166,7 @@ declare i8* @llvm.eh.actions(...) #3
define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %x.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+ %x.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
%x = bitcast i8* %x.i8 to i32*
%2 = bitcast i32* %x to i8*
call void @"\01?catch_two@@YAXXZ"() #3
@@ -204,7 +204,7 @@ stub: ; preds = %entry
define internal i8* @"\01?test@@YAXXZ.catch2"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %x21.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+ %x21.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
%x21 = bitcast i8* %x21.i8 to i32*
%2 = bitcast i32* %x21 to i8*
call void @"\01?catch_one@@YAXXZ"() #3
@@ -238,10 +238,10 @@ stub: ; preds = %entry
}
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="?test@@YAXXZ" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-exception-code.ll b/test/CodeGen/WinEH/seh-exception-code.ll
new file mode 100644
index 000000000000..2998e7982133
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-exception-code.ll
@@ -0,0 +1,66 @@
+; RUN: opt -winehprepare -S < %s | FileCheck %s
+
+; WinEHPrepare was crashing during phi demotion.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+declare i32 @__C_specific_handler(...)
+
+@str = linkonce_odr unnamed_addr constant [16 x i8] c"caught it! %lx\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+declare void @maycrash()
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...)
+
+; Function Attrs: nounwind uwtable
+define void @doit() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+ invoke void @maycrash()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ invoke void @maycrash()
+ to label %__try.cont unwind label %lpad.1
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ br label %__except
+
+lpad.1: ; preds = %invoke.cont, %lpad
+ %2 = landingpad { i8*, i32 }
+ catch i8* null
+ %3 = extractvalue { i8*, i32 } %2, 0
+ br label %__except
+
+__except: ; preds = %lpad, %lpad.1
+ %exn.slot.0 = phi i8* [ %3, %lpad.1 ], [ %1, %lpad ]
+ %4 = ptrtoint i8* %exn.slot.0 to i64
+ %5 = trunc i64 %4 to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @str, i64 0, i64 0), i32 %5)
+ br label %__try.cont
+
+__try.cont: ; preds = %invoke.cont, %__except
+ ret void
+}
+
+; CHECK-LABEL: define void @doit()
+; CHECK: landingpad
+; CHECK: indirectbr i8* %{{[^,]*}}, [label %[[except_split1:.*]]]
+; CHECK: [[except_split1]]:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: br label %__except
+;
+; CHECK: landingpad
+; CHECK: indirectbr i8* %{{[^,]*}}, [label %[[except_split2:.*]]]
+; CHECK: [[except_split2]]:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: br label %__except
+;
+; CHECK: __except:
+; CHECK: phi
+; CHECK: call i32 (i8*, ...) @printf
diff --git a/test/CodeGen/WinEH/seh-exception-code2.ll b/test/CodeGen/WinEH/seh-exception-code2.ll
new file mode 100644
index 000000000000..0356956502c0
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-exception-code2.ll
@@ -0,0 +1,91 @@
+; RUN: opt -winehprepare -S < %s | FileCheck %s
+
+; WinEHPrepare was crashing during phi demotion.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+declare i32 @__C_specific_handler(...)
+
+@str = linkonce_odr unnamed_addr constant [16 x i8] c"caught it! %lx\0A\00", align 1
+
+declare void @maycrash()
+declare void @finally(i1 %abnormal)
+declare i32 @printf(i8* nocapture readonly, ...)
+declare i32 @llvm.eh.typeid.for(i8*)
+
+; Function Attrs: nounwind uwtable
+define void @doit() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+ invoke void @maycrash()
+ to label %invoke.cont unwind label %lpad.1
+
+invoke.cont: ; preds = %entry
+ invoke void @maycrash()
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %lp0 = landingpad { i8*, i32 }
+ cleanup
+ catch i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*)
+ %ehptr.0 = extractvalue { i8*, i32 } %lp0, 0
+ %ehsel.0 = extractvalue { i8*, i32 } %lp0, 1
+ call void @finally(i1 true)
+ br label %ehdispatch
+
+lpad.1: ; preds = %invoke.cont, %lpad
+ %lp1 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*)
+ %ehptr.1 = extractvalue { i8*, i32 } %lp1, 0
+ %ehsel.1 = extractvalue { i8*, i32 } %lp1, 1
+ br label %ehdispatch
+
+ehdispatch:
+ %ehptr.2 = phi i8* [ %ehptr.0, %lpad ], [ %ehptr.1, %lpad.1 ]
+ %ehsel.2 = phi i32 [ %ehsel.0, %lpad ], [ %ehsel.1, %lpad.1 ]
+ %mysel = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*))
+ %matches = icmp eq i32 %ehsel.2, %mysel
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad, %lpad.1
+ %t4 = ptrtoint i8* %ehptr.2 to i64
+ %t5 = trunc i64 %t4 to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @str, i64 0, i64 0), i32 %t5)
+ br label %__try.cont
+
+__try.cont: ; preds = %invoke.cont, %__except
+ call void @finally(i1 false)
+ ret void
+
+eh.resume:
+ %ehvals0 = insertvalue { i8*, i32 } undef, i8* %ehptr.2, 0
+ %ehvals = insertvalue { i8*, i32 } %ehvals0, i32 %ehsel.2, 1
+ resume { i8*, i32 } %ehvals
+}
+
+define internal i32 @"\01?filt$0@0@doit@@"(i8* %exception_pointers, i8* %frame_pointer) #1 {
+entry:
+ %0 = bitcast i8* %exception_pointers to { i32*, i8* }*
+ %1 = getelementptr inbounds { i32*, i8* }, { i32*, i8* }* %0, i32 0, i32 0
+ %2 = load i32*, i32** %1
+ %3 = load i32, i32* %2
+ %cmp = icmp eq i32 %3, -1073741819
+ %4 = zext i1 %cmp to i32
+ ret i32 %4
+}
+
+; CHECK-LABEL: define void @doit()
+; CHECK: %lp0 = landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: catch i8*
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions({{.*}})
+; CHECK-NEXT: indirectbr i8* %{{[^,]*}}, [label %__except]
+;
+; CHECK: %lp1 = landingpad { i8*, i32 }
+; CHECK-NEXT: catch i8*
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions({{.*}})
+; CHECK-NEXT: indirectbr i8* %{{[^,]*}}, [label %__except]
+;
+; CHECK: __except:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: call i32 (i8*, ...) @printf
diff --git a/test/CodeGen/WinEH/seh-inlined-finally.ll b/test/CodeGen/WinEH/seh-inlined-finally.ll
index 5943cb77cee2..157adf0c8183 100644
--- a/test/CodeGen/WinEH/seh-inlined-finally.ll
+++ b/test/CodeGen/WinEH/seh-inlined-finally.ll
@@ -13,9 +13,9 @@ target triple = "x86_64-pc-windows-msvc"
declare i32 @puts(i8*)
declare void @may_crash()
declare i32 @__C_specific_handler(...)
-declare i8* @llvm.framerecover(i8*, i8*, i32) #1
-declare i8* @llvm.frameaddress(i32)
-declare void @llvm.frameescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32) #1
+declare i8* @llvm.localaddress()
+declare void @llvm.localescape(...)
declare dllimport void @EnterCriticalSection(%struct._RTL_CRITICAL_SECTION*)
declare dllimport void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION*)
@@ -47,14 +47,14 @@ lpad: ; preds = %entry
define i32 @call_may_crash_locked() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
entry:
%p = alloca %struct._RTL_CRITICAL_SECTION, align 8
- call void (...) @llvm.frameescape(%struct._RTL_CRITICAL_SECTION* %p)
+ call void (...) @llvm.localescape(%struct._RTL_CRITICAL_SECTION* %p)
call void @EnterCriticalSection(%struct._RTL_CRITICAL_SECTION* %p)
invoke void @may_crash()
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %tmp2 = call i8* @llvm.frameaddress(i32 0)
- %tmp3 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp2, i32 0) #2
+ %tmp2 = call i8* @llvm.localaddress()
+ %tmp3 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp2, i32 0) #2
%tmp6 = bitcast i8* %tmp3 to %struct._RTL_CRITICAL_SECTION*
call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp6)
ret i32 42
@@ -62,8 +62,8 @@ invoke.cont: ; preds = %entry
lpad: ; preds = %entry
%tmp7 = landingpad { i8*, i32 }
cleanup
- %tmp8 = call i8* @llvm.frameaddress(i32 0)
- %tmp9 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp8, i32 0)
+ %tmp8 = call i8* @llvm.localaddress()
+ %tmp9 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp8, i32 0)
%tmp12 = bitcast i8* %tmp9 to %struct._RTL_CRITICAL_SECTION*
call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp12)
resume { i8*, i32 } %tmp7
@@ -78,6 +78,6 @@ lpad: ; preds = %entry
; CHECK-NEXT: indirectbr i8* %recover, []
; CHECK-LABEL: define internal void @call_may_crash_locked.cleanup(i8*, i8*)
-; CHECK: %tmp9 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %1, i32 0)
+; CHECK: %tmp9 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %1, i32 0)
; CHECK: %tmp12 = bitcast i8* %tmp9 to %struct._RTL_CRITICAL_SECTION*
; CHECK: call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp12)
diff --git a/test/CodeGen/WinEH/seh-outlined-finally-win32.ll b/test/CodeGen/WinEH/seh-outlined-finally-win32.ll
new file mode 100644
index 000000000000..3649433c4b61
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-outlined-finally-win32.ll
@@ -0,0 +1,172 @@
+; RUN: opt -S -winehprepare < %s | FileCheck %s
+
+; Test case based on this code:
+;
+; extern "C" int _abnormal_termination();
+; #pragma intrinsic(_abnormal_termination)
+; extern "C" int printf(const char *, ...);
+; extern "C" void may_crash() {
+; *(volatile int *)0 = 42;
+; }
+; int main() {
+; int myres = 0;
+; __try {
+; __try {
+; may_crash();
+; } __finally {
+; printf("inner finally %d\n", _abnormal_termination());
+; may_crash();
+; }
+; } __finally {
+; printf("outer finally %d\n", _abnormal_termination());
+; }
+; }
+;
+; Note that if the inner finally crashes, the outer finally still runs. There
+; is nothing like a std::terminate call in this situation.
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@" = comdat any
+
+$"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@" = comdat any
+
+@"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@" = linkonce_odr unnamed_addr constant [18 x i8] c"outer finally %d\0A\00", comdat, align 1
+@"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@" = linkonce_odr unnamed_addr constant [18 x i8] c"inner finally %d\0A\00", comdat, align 1
+
+; Function Attrs: nounwind
+define void @may_crash() #0 {
+entry:
+ store volatile i32 42, i32* null, align 4
+ ret void
+}
+
+; Function Attrs: nounwind
+define i32 @main() #0 personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ %myres = alloca i32, align 4
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ store i32 0, i32* %myres, align 4
+ invoke void @may_crash() #4
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ invoke void @"\01?fin$1@0@main@@"(i8 zeroext 0, i8* %0) #4
+ to label %invoke.cont.2 unwind label %lpad.1
+
+invoke.cont.2: ; preds = %invoke.cont
+ %1 = call i8* @llvm.frameaddress(i32 0)
+ call void @"\01?fin$0@0@main@@"(i8 zeroext 0, i8* %1)
+ ret i32 0
+
+lpad: ; preds = %entry
+ %2 = landingpad { i8*, i32 }
+ cleanup
+ %3 = extractvalue { i8*, i32 } %2, 0
+ store i8* %3, i8** %exn.slot
+ %4 = extractvalue { i8*, i32 } %2, 1
+ store i32 %4, i32* %ehselector.slot
+ %5 = call i8* @llvm.frameaddress(i32 0)
+ invoke void @"\01?fin$1@0@main@@"(i8 zeroext 1, i8* %5) #4
+ to label %invoke.cont.3 unwind label %lpad.1
+
+lpad.1: ; preds = %lpad, %invoke.cont
+ %6 = landingpad { i8*, i32 }
+ cleanup
+ %7 = extractvalue { i8*, i32 } %6, 0
+ store i8* %7, i8** %exn.slot
+ %8 = extractvalue { i8*, i32 } %6, 1
+ store i32 %8, i32* %ehselector.slot
+ br label %ehcleanup
+
+invoke.cont.3: ; preds = %lpad
+ br label %ehcleanup
+
+ehcleanup: ; preds = %invoke.cont.3, %lpad.1
+ %9 = call i8* @llvm.frameaddress(i32 0)
+ call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %9)
+ br label %eh.resume
+
+eh.resume: ; preds = %ehcleanup
+ %exn = load i8*, i8** %exn.slot
+ %sel = load i32, i32* %ehselector.slot
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+ %lpad.val.4 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+ resume { i8*, i32 } %lpad.val.4
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke void @may_crash()
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void ()* @main.cleanup)
+; CHECK-NEXT: indirectbr
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void ()* @main.cleanup.1)
+; CHECK-NEXT: indirectbr
+
+; CHECK-LABEL: define internal void @main.cleanup()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %{{.*}})
+; CHECK: call void @"\01?fin$1@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+; CHECK: call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+
+; CHECK-LABEL: define internal void @main.cleanup.1()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %{{.*}})
+; CHECK: call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+
+; Function Attrs: noinline nounwind
+define internal void @"\01?fin$0@0@main@@"(i8 zeroext %abnormal_termination, i8* %frame_pointer) #1 {
+entry:
+ %frame_pointer.addr = alloca i8*, align 4
+ %abnormal_termination.addr = alloca i8, align 1
+ %0 = call i8* @llvm.frameaddress(i32 1)
+ %1 = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %0)
+ store i8* %frame_pointer, i8** %frame_pointer.addr, align 4
+ store i8 %abnormal_termination, i8* %abnormal_termination.addr, align 1
+ %2 = zext i8 %abnormal_termination to i32
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@", i32 0, i32 0), i32 %2)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.frameaddress(i32) #2
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) #2
+
+declare i32 @printf(i8*, ...) #3
+
+; Function Attrs: noinline nounwind
+define internal void @"\01?fin$1@0@main@@"(i8 zeroext %abnormal_termination, i8* %frame_pointer) #1 {
+entry:
+ %frame_pointer.addr = alloca i8*, align 4
+ %abnormal_termination.addr = alloca i8, align 1
+ %0 = call i8* @llvm.frameaddress(i32 1)
+ %1 = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %0)
+ store i8* %frame_pointer, i8** %frame_pointer.addr, align 4
+ store i8 %abnormal_termination, i8* %abnormal_termination.addr, align 1
+ %2 = zext i8 %abnormal_termination to i32
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@", i32 0, i32 0), i32 %2)
+ call void @may_crash()
+ ret void
+}
+
+declare i32 @_except_handler3(...)
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { noinline }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 "}
diff --git a/test/CodeGen/WinEH/seh-outlined-finally.ll b/test/CodeGen/WinEH/seh-outlined-finally.ll
index 3c27212192dd..529f85b9602b 100644
--- a/test/CodeGen/WinEH/seh-outlined-finally.ll
+++ b/test/CodeGen/WinEH/seh-outlined-finally.ll
@@ -49,12 +49,12 @@ entry:
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %0 = call i8* @llvm.frameaddress(i32 0)
+ %0 = call i8* @llvm.localaddress()
invoke void @"\01?fin$1@0@main@@"(i1 zeroext false, i8* %0) #4
to label %invoke.cont2 unwind label %lpad1
invoke.cont2: ; preds = %invoke.cont
- %1 = call i8* @llvm.frameaddress(i32 0)
+ %1 = call i8* @llvm.localaddress()
call void @"\01?fin$0@0@main@@"(i1 zeroext false, i8* %1)
ret i32 0
@@ -65,7 +65,7 @@ lpad: ; preds = %entry
store i8* %3, i8** %exn.slot
%4 = extractvalue { i8*, i32 } %2, 1
store i32 %4, i32* %ehselector.slot
- %5 = call i8* @llvm.frameaddress(i32 0)
+ %5 = call i8* @llvm.localaddress()
invoke void @"\01?fin$1@0@main@@"(i1 zeroext true, i8* %5) #4
to label %invoke.cont3 unwind label %lpad1
@@ -82,7 +82,7 @@ invoke.cont3: ; preds = %lpad
br label %ehcleanup
ehcleanup: ; preds = %invoke.cont3, %lpad1
- %9 = call i8* @llvm.frameaddress(i32 0)
+ %9 = call i8* @llvm.localaddress()
call void @"\01?fin$0@0@main@@"(i1 zeroext true, i8* %9)
br label %eh.resume
@@ -146,7 +146,7 @@ entry:
declare i32 @__C_specific_handler(...)
; Function Attrs: nounwind readnone
-declare i8* @llvm.frameaddress(i32) #3
+declare i8* @llvm.localaddress() #3
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-prepared-basic.ll b/test/CodeGen/WinEH/seh-prepared-basic.ll
index b981dc2d9bd8..b6a30309f1c1 100644
--- a/test/CodeGen/WinEH/seh-prepared-basic.ll
+++ b/test/CodeGen/WinEH/seh-prepared-basic.ll
@@ -17,7 +17,7 @@ target triple = "x86_64-pc-windows-msvc"
; Function Attrs: uwtable
define void @do_except() #0 personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
entry:
- call void (...) @llvm.frameescape()
+ call void (...) @llvm.localescape()
invoke void @g() #5
to label %__try.cont unwind label %lpad1
@@ -64,10 +64,10 @@ declare i32 @llvm.eh.typeid.for(i8*) #3
declare i8* @llvm.eh.actions(...) #4
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #4
+declare void @llvm.localescape(...) #4
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #3
+declare i8* @llvm.localrecover(i8*, i8*, i32) #3
attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="do_except" }
attributes #1 = { noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-simple.ll b/test/CodeGen/WinEH/seh-simple.ll
index 98f06ef12c9f..060186484aec 100644
--- a/test/CodeGen/WinEH/seh-simple.ll
+++ b/test/CodeGen/WinEH/seh-simple.ll
@@ -107,6 +107,38 @@ eh.resume:
; CHECK-NEXT: %r = phi i32 [ 0, %entry ], [ 1, %lpad.return_crit_edge ]
; CHECK-NEXT: ret i32 %r
+define i32 @except_join() personality i32 (...)* @__C_specific_handler {
+entry:
+ invoke void @might_crash()
+ to label %return unwind label %lpad
+
+lpad:
+ %ehvals = landingpad { i8*, i32 }
+ catch i32 ()* @filt
+ %sel = extractvalue { i8*, i32 } %ehvals, 1
+ %filt_sel = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @filt to i8*))
+ %matches = icmp eq i32 %sel, %filt_sel
+ br i1 %matches, label %return, label %eh.resume
+
+return:
+ ret i32 0
+
+eh.resume:
+ resume { i8*, i32 } %ehvals
+}
+
+; CHECK-LABEL: define i32 @except_join()
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: catch i32 ()* @filt
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i32 -1, i8* blockaddress(@except_join, %lpad.return_crit_edge))
+; CHECK-NEXT: indirectbr {{.*}} [label %lpad.return_crit_edge]
+;
+; CHECK: lpad.return_crit_edge:
+; CHECK: br label %return
+;
+; CHECK: return:
+; CHECK-NEXT: ret i32 0
+
define i32 @lpad_phi() personality i32 (...)* @__C_specific_handler {
entry:
invoke void @might_crash()
@@ -196,6 +228,6 @@ eh.resume:
; X64-LABEL: define internal void @lpad_phi.cleanup(i8*, i8*)
; X86-LABEL: define internal void @lpad_phi.cleanup()
; X86: call i8* @llvm.frameaddress(i32 1)
-; CHECK: call i8* @llvm.framerecover({{.*}})
+; CHECK: call i8* @llvm.localrecover({{.*}})
; CHECK: load i32
; CHECK: store i32 %{{.*}}, i32*
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 74d20f348b52..4e43f6f51921 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -269,7 +269,7 @@ entry:
define <4 x double> @vperm2z_0x08(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x08:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
@@ -279,7 +279,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x18:
; ALL: # BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x double> %s
@@ -288,7 +288,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
define <4 x double> @vperm2z_0x28(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x28:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -298,7 +298,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x38:
; ALL: # BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x double> %s
@@ -307,7 +307,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
define <4 x double> @vperm2z_0x80(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x80:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -316,7 +316,7 @@ define <4 x double> @vperm2z_0x80(<4 x double> %a) {
define <4 x double> @vperm2z_0x81(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x81:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x double> %s
@@ -325,7 +325,7 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) {
define <4 x double> @vperm2z_0x82(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x82:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
@@ -334,7 +334,7 @@ define <4 x double> @vperm2z_0x82(<4 x double> %a) {
define <4 x double> @vperm2z_0x83(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x83:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x double> %s
@@ -345,8 +345,8 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) {
define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: vperm2z_int_0x83:
; ALL: # BB#0:
-; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0
-; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0
+; AVX1: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX2: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
%s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
%c = add <4 x i64> %b, %s
ret <4 x i64> %c
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index b9f490b8a39a..7642cd4e6c5c 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -406,20 +406,6 @@ define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2
}
declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
- define <8 x i32> @test_cvtpd2udq(<8 x double> %a) {
- ;CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0]
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %a, <8 x i32>zeroinitializer, i8 -1, i32 2)
- ret <8 x i32>%res
- }
- declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
- define <16 x i32> @test_cvtps2udq(<16 x float> %a) {
- ;CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0]
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32>zeroinitializer, i16 -1, i32 1)
- ret <16 x i32>%res
- }
- declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
;CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
@@ -434,35 +420,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
- ; cvt intrinsics
- define <16 x float> @test_cvtdq2ps(<16 x i32> %a) {
- ;CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
- ret <16 x float>%res
- }
- declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
-
- define <16 x float> @test_cvtudq2ps(<16 x i32> %a) {
- ;CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
- ret <16 x float>%res
- }
- declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
-
- define <8 x double> @test_cvtdq2pd(<8 x i32> %a) {
- ;CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0]
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
- ret <8 x double>%res
- }
- declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
-
- define <8 x double> @test_cvtudq2pd(<8 x i32> %a) {
- ;CHECK: vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0]
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
- ret <8 x double>%res
- }
- declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
-
; fp min - max
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vmaxpd
@@ -482,13 +439,6 @@ define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
<8 x double>, i8, i32)
- define <8 x float> @test_cvtpd2ps(<8 x double> %a) {
- ;CHECK: vcvtpd2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0xfd,0x38,0x5a,0xc0]
- %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %a, <8 x float>zeroinitializer, i8 -1, i32 1)
- ret <8 x float>%res
- }
- declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
-
declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 9574c016ad50..71bf63ed44d0 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -997,3 +997,44 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x
ret <64 x i8> %res2
}
+declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 0119d3945f4e..f5413896789a 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -3763,3 +3763,83 @@ define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16>
ret <16 x i16> %res2
}
+declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
diff --git a/test/CodeGen/X86/cppeh-nounwind.ll b/test/CodeGen/X86/cppeh-nounwind.ll
new file mode 100644
index 000000000000..d9bc001a92df
--- /dev/null
+++ b/test/CodeGen/X86/cppeh-nounwind.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s
+
+; Sometimes invokes of nounwind functions make it through to CodeGen, especially
+; at -O0, where Clang sometimes optimistically annotates functions as nounwind.
+; WinEHPrepare ends up outlining functions, and emitting references to LSDA
+; labels. Make sure we emit the LSDA in that case.
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @nounwind_func() nounwind
+declare void @cleanup()
+
+define void @should_emit_tables() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ invoke void @nounwind_func()
+ to label %done unwind label %lpad
+
+done:
+ ret void
+
+lpad:
+ %vals = landingpad { i8*, i32 }
+ cleanup
+ call void @cleanup()
+ resume { i8*, i32 } %vals
+}
+
+; CHECK: _should_emit_tables:
+; CHECK: calll _nounwind_func
+; CHECK: retl
+
+; CHECK: L__ehtable$should_emit_tables:
+
+; CHECK: ___ehhandler$should_emit_tables:
+; CHECK: movl $L__ehtable$should_emit_tables, %eax
+; CHECK: jmp ___CxxFrameHandler3 # TAILCALL
diff --git a/test/CodeGen/X86/eh-nolandingpads.ll b/test/CodeGen/X86/eh-nolandingpads.ll
new file mode 100644
index 000000000000..962952266214
--- /dev/null
+++ b/test/CodeGen/X86/eh-nolandingpads.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+; Test that we emit functions with explicitly specified personality,
+; even if no landing pads are left.
+
+declare i32 @__my_personality_v0(...)
+declare void @might_throw()
+
+define i32 @foo() personality i32 (...)* @__my_personality_v0 {
+; CHECK: .cfi_personality 3, __my_personality_v0
+ call void @might_throw()
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
index 279bb0624ace..34eac62e3673 100644
--- a/test/CodeGen/X86/fdiv-combine.ll
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
; Anything more than one division using a single divisor operand
; should be converted into a reciprocal and multiplication.
@@ -17,9 +17,9 @@ define float @div2_arcp(float %x, float %y, float %z) #0 {
; CHECK: # BB#0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
-; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: mulss %xmm3, %xmm0
+; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: retq
%div1 = fdiv arcp float %x, %z
%mul = fmul arcp float %div1, %y
@@ -27,5 +27,22 @@ define float @div2_arcp(float %x, float %y, float %z) #0 {
ret float %div2
}
+; If the reciprocal is already calculated, we should not
+; generate an extra multiplication by 1.0.
+
+define double @div3_arcp(double %x, double %y, double %z) #0 {
+; CHECK-LABEL: div3_arcp:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd{{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: divsd %xmm1, %xmm2
+; CHECK-NEXT: mulsd %xmm2, %xmm0
+; CHECK-NEXT: addsd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %div1 = fdiv fast double 1.0, %y
+ %div2 = fdiv fast double %x, %y
+ %ret = fadd fast double %div2, %div1
+ ret double %ret
+}
+
; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/frameescape.ll b/test/CodeGen/X86/frameescape.ll
index 00bc55d24878..179a936304ba 100644
--- a/test/CodeGen/X86/frameescape.ll
+++ b/test/CodeGen/X86/frameescape.ll
@@ -1,19 +1,19 @@
; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
-declare void @llvm.frameescape(...)
+declare void @llvm.localescape(...)
declare i8* @llvm.frameaddress(i32)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
declare i32 @printf(i8*, ...)
@str = internal constant [10 x i8] c"asdf: %d\0A\00"
define void @print_framealloc_from_fp(i8* %fp) {
- %a.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
+ %a.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
%a = bitcast i8* %a.i8 to i32*
%a.val = load i32, i32* %a
call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
- %b.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
+ %b.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
%b = bitcast i8* %b.i8 to i32*
%b.val = load i32, i32* %b
call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
@@ -61,7 +61,7 @@ define void @print_framealloc_from_fp(i8* %fp) {
define void @alloc_func() {
%a = alloca i32
%b = alloca i32, i32 2
- call void (...) @llvm.frameescape(i32* %a, i32* %b)
+ call void (...) @llvm.localescape(i32* %a, i32* %b)
store i32 42, i32* %a
store i32 13, i32* %b
%fp = call i8* @llvm.frameaddress(i32 0)
@@ -105,7 +105,7 @@ define i32 @main() {
define void @alloc_func_no_frameaddr() {
%a = alloca i32
%b = alloca i32
- call void (...) @llvm.frameescape(i32* %a, i32* %b)
+ call void (...) @llvm.localescape(i32* %a, i32* %b)
store i32 42, i32* %a
store i32 13, i32* %b
call void @print_framealloc_from_fp(i8* null)
diff --git a/test/CodeGen/X86/frameregister.ll b/test/CodeGen/X86/frameregister.ll
new file mode 100644
index 000000000000..826bb9d78c9d
--- /dev/null
+++ b/test/CodeGen/X86/frameregister.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnueabi | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-linux-gnueabi | FileCheck %s --check-prefix=OPT
+
+define i64 @get_frame() #0 {
+entry:
+; CHECK-LABEL: get_frame:
+; CHECK: movq %rbp, %rax
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+; OPT: @llvm.read_register.i64
+ ret i64 %sp
+}
+
+define void @set_frame(i64 %val) #0 {
+entry:
+; CHECK-LABEL: set_frame:
+; CHECK: movq %rdi, %rbp
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+; OPT: @llvm.write_register.i64
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("rbp");
+; CHECK-NOT: .asciz "rbp"
+!0 = !{!"rbp\00"}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll
index 8fbed9f7bee8..c8d425c3889f 100644
--- a/test/CodeGen/X86/implicit-null-check-negative.ll
+++ b/test/CodeGen/X86/implicit-null-check-negative.ll
@@ -51,4 +51,46 @@ define i32 @imp_null_check_load_no_md(i32* %x) {
ret i32 %t
}
+define i32 @imp_null_check_no_hoist_over_acquire_load(i32* %x, i32* %y) {
+; We cannot hoist %t1 over %t0 since %t0 is an acquire load
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load atomic i32, i32* %y acquire, align 4
+ %t1 = load i32, i32* %x
+ %p = add i32 %t0, %t1
+ ret i32 %p
+}
+
+define i32 @imp_null_check_add_result(i32* %x, i32* %y) {
+; This will codegen to:
+;
+; movl (%rsi), %eax
+; addl (%rdi), %eax
+;
+; The load instruction we wish to hoist is the addl, but there is a
+; write-after-write hazard preventing that from happening. We could
+; get fancy here and exploit the commutativity of addition, but right
+; now -implicit-null-checks isn't that smart.
+;
+
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load i32, i32* %y
+ %t1 = load i32, i32* %x
+ %p = add i32 %t0, %t1
+ ret i32 %p
+}
+
!0 = !{}
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index 1d1b36bbd5d0..fd7a902eefc1 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -76,6 +76,31 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
ret i32 %p1
}
+define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
+; CHECK-LABEL: _imp_null_check_hoist_over_unrelated_load:
+; CHECK: Ltmp7:
+; CHECK: movl (%rdi), %eax
+; CHECK: movl (%rsi), %ecx
+; CHECK: movl %ecx, (%rdx)
+; CHECK: retq
+; CHECK: Ltmp6:
+; CHECK: movl $42, %eax
+; CHECK: retq
+
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load i32, i32* %y
+ %t1 = load i32, i32* %x
+ store i32 %t0, i32* %z
+ ret i32 %t1
+}
+
!0 = !{}
; CHECK-LABEL: __LLVM_FaultMaps:
@@ -88,7 +113,7 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-NEXT: .short 0
; # functions:
-; CHECK-NEXT: .long 3
+; CHECK-NEXT: .long 4
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_add_result
@@ -117,6 +142,19 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-NEXT: .long Ltmp2-_imp_null_check_gep_load
; FunctionAddr:
+; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load
+; NumFaultingPCs
+; CHECK-NEXT: .long 1
+; Reserved:
+; CHECK-NEXT: .long 0
+; Fault[0].Type:
+; CHECK-NEXT: .long 1
+; Fault[0].FaultOffset:
+; CHECK-NEXT: .long Ltmp7-_imp_null_check_hoist_over_unrelated_load
+; Fault[0].HandlerOffset:
+; CHECK-NEXT: .long Ltmp6-_imp_null_check_hoist_over_unrelated_load
+
+; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_load
; NumFaultingPCs
; CHECK-NEXT: .long 1
@@ -131,10 +169,12 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; OBJDUMP: FaultMap table:
; OBJDUMP-NEXT: Version: 0x1
-; OBJDUMP-NEXT: NumFunctions: 3
+; OBJDUMP-NEXT: NumFunctions: 4
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3
diff --git a/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
new file mode 100644
index 000000000000..91b1ffed4e0f
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
@@ -0,0 +1,10 @@
+; RUN: not llc -march=x86 -no-integrated-as < %s 2>&1 | FileCheck %s
+
+@x = global i32 0, align 4
+
+;CHECK: error: invalid operand for inline asm constraint 'n'
+define void @foo() {
+ %a = getelementptr i32, i32* @x, i32 1
+ call void asm sideeffect "foo $0", "n"(i32* %a) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/legalize-shl-vec.ll b/test/CodeGen/X86/legalize-shl-vec.ll
new file mode 100644
index 000000000000..7ec2a663513f
--- /dev/null
+++ b/test/CodeGen/X86/legalize-shl-vec.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define <2 x i256> @test_shl(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = shl <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_shl
+ ; CHECK: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK: retq
+}
+
+define <2 x i256> @test_srl(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = lshr <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_srl
+ ; CHECK: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK: retq
+}
+
+define <2 x i256> @test_sra(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = ashr <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_sra
+ ; CHECK: sarq $63
+}
diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll
index d4cd59ffac3a..0943bebbb099 100644
--- a/test/CodeGen/X86/machine-combiner.ll
+++ b/test/CodeGen/X86/machine-combiner.ll
@@ -1,15 +1,23 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds1:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds1:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -17,12 +25,19 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds2:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds2:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %t1, %x3
@@ -30,12 +45,19 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds3:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds3:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %x3, %t1
@@ -43,12 +65,19 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds4:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds4:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
@@ -59,16 +88,27 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
; produced because that would cost more compile time.
define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds5:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: addss %xmm5, %xmm4
+; SSE-NEXT: addss %xmm6, %xmm4
+; SSE-NEXT: addss %xmm4, %xmm0
+; SSE-NEXT: addss %xmm7, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds5:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm1
+; AVX-NEXT: vaddss %xmm6, %xmm1, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -83,17 +123,90 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa
; Also, we should reassociate such that the result of the high latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.
-
+
define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds6:
-; CHECK: # BB#0:
-; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds6:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds6:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fdiv float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
ret float %t2
}
+; Verify that SSE and AVX scalar single-precison multiplies are reassociated.
+
+define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
+; SSE-LABEL: reassociate_muls1:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm3, %xmm2
+; SSE-NEXT: mulss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_muls1:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv float %x0, %x1
+ %t1 = fmul float %x2, %t0
+ %t2 = fmul float %x3, %t1
+ ret float %t2
+}
+
+; Verify that SSE and AVX scalar double-precison adds are reassociated.
+
+define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
+; SSE-LABEL: reassociate_adds_double:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm3, %xmm2
+; SSE-NEXT: addsd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds_double:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv double %x0, %x1
+ %t1 = fadd double %x2, %t0
+ %t2 = fadd double %x3, %t1
+ ret double %t2
+}
+
+; Verify that SSE and AVX scalar double-precison multiplies are reassociated.
+
+define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
+; SSE-LABEL: reassociate_muls_double:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm3, %xmm2
+; SSE-NEXT: mulsd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_muls_double:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv double %x0, %x1
+ %t1 = fmul double %x2, %t0
+ %t2 = fmul double %x3, %t1
+ ret double %t2
+}
+
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
index faaec262cb91..a6b721a7a6f1 100644
--- a/test/CodeGen/X86/pr13577.ll
+++ b/test/CodeGen/X86/pr13577.ll
@@ -1,5 +1,20 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64-darwin | FileCheck %s
+; CHECK-LABEL: LCPI0_0:
+; CHECK-NEXT: .long 4286578688
+; CHECK-LABEL: LCPI0_1:
+; CHECK-NEXT: .long 2139095040
+
+; CHECK-LABEL: foo:
+; CHECK: movq {{.*}}, %rax
+; CHECK: shlq $48, %rax
+; CHECK: sets %al
+; CHECK: testb %al, %al
+; CHECK: flds LCPI0_0(%rip)
+; CHECK: flds LCPI0_1(%rip)
+; CHECK: fcmovne %st(1), %st(0)
+; CHECK: fstp %st(1)
+; CHECK: retq
define x86_fp80 @foo(x86_fp80 %a) {
%1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
ret x86_fp80 %1
diff --git a/test/CodeGen/X86/read-fp-no-frame-pointer.ll b/test/CodeGen/X86/read-fp-no-frame-pointer.ll
new file mode 100644
index 000000000000..9f78c294ce88
--- /dev/null
+++ b/test/CodeGen/X86/read-fp-no-frame-pointer.ll
@@ -0,0 +1,12 @@
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_frame() nounwind {
+entry:
+; CHECK: register ebp is allocatable: function has no frame pointer
+ %fp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %fp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = !{!"ebp\00"}
diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll
index 423b9914e99d..a4ea8ab78c79 100644
--- a/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -10,14 +10,14 @@ declare void @crash()
declare i32 @printf(i8* nocapture readonly, ...) nounwind
declare i32 @llvm.eh.typeid.for(i8*)
declare i8* @llvm.frameaddress(i32)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
-declare void @llvm.frameescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
entry:
%__exceptioncode = alloca i32, align 4
- call void (...) @llvm.frameescape(i32* %__exceptioncode)
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
invoke void @crash() #5
to label %__try.cont unwind label %lpad
@@ -45,7 +45,7 @@ define internal i32 @"filt$main"() {
entry:
%ebp = tail call i8* @llvm.frameaddress(i32 1)
%parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
- %code.i8 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
%__exceptioncode = bitcast i8* %code.i8 to i32*
%info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
%0 = bitcast i8* %info.addr to i32***
@@ -59,26 +59,38 @@ entry:
; Check that we can get the exception code from eax to the printf.
; CHECK-LABEL: _main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; Ensure that we push *all* the CSRs, since they are clobbered by the
+; __except block.
+; CHECK: pushl %ebx
+; CHECK: pushl %edi
+; CHECK: pushl %esi
+
; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
; CHECK: movl %esp, [[reg_offs]](%ebp)
; CHECK: movl $L__ehtable$main,
; EH state 0
-; CHECK: movl $0, -4(%ebp)
+; CHECK: movl $0, -16(%ebp)
; CHECK: calll _crash
+; CHECK: popl %esi
+; CHECK: popl %edi
+; CHECK: popl %ebx
; CHECK: retl
; CHECK: # Block address taken
; stackrestore
-; CHECK: movl [[reg_offs]](%ebp), %esp
+; CHECK: movl -24(%ebp), %esp
; EH state -1
; CHECK: movl [[code_offs]](%ebp), %[[code:[a-z]+]]
-; CHECK: movl $-1, -4(%ebp)
+; CHECK: movl $-1, -16(%ebp)
; CHECK-DAG: movl %[[code]], 4(%esp)
; CHECK-DAG: movl $_str, (%esp)
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: .align 4
; CHECK: L__ehtable$main
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long _filt$main
diff --git a/test/CodeGen/X86/seh-except-finally.ll b/test/CodeGen/X86/seh-except-finally.ll
index 4327a64468f9..0630d001bb76 100644
--- a/test/CodeGen/X86/seh-except-finally.ll
+++ b/test/CodeGen/X86/seh-except-finally.ll
@@ -41,7 +41,7 @@ entry:
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %0 = call i8* @llvm.frameaddress(i32 0)
+ %0 = call i8* @llvm.localaddress()
invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext false, i8* %0) #5
to label %invoke.cont2 unwind label %lpad1
@@ -56,7 +56,7 @@ lpad: ; preds = %entry
store i8* %2, i8** %exn.slot
%3 = extractvalue { i8*, i32 } %1, 1
store i32 %3, i32* %ehselector.slot
- %4 = call i8* @llvm.frameaddress(i32 0)
+ %4 = call i8* @llvm.localaddress()
invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext true, i8* %4) #5
to label %invoke.cont3 unwind label %lpad1
@@ -153,7 +153,7 @@ declare i32 @puts(i8*) #3
declare i32 @__C_specific_handler(...)
; Function Attrs: nounwind readnone
-declare i8* @llvm.frameaddress(i32) #4
+declare i8* @llvm.localaddress() #4
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.typeid.for(i8*) #4
diff --git a/test/CodeGen/X86/seh-stack-realign-win32.ll b/test/CodeGen/X86/seh-stack-realign-win32.ll
new file mode 100644
index 000000000000..f3ab71803ca7
--- /dev/null
+++ b/test/CodeGen/X86/seh-stack-realign-win32.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+
+; 32-bit catch-all has to use a filter function because that's how it saves the
+; exception code.
+
+@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1
+
+declare i32 @_except_handler3(...)
+declare void @crash()
+declare i32 @printf(i8* nocapture readonly, ...) nounwind
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ ; The EH code allocation is overaligned, triggering realignment.
+ %__exceptioncode = alloca i32, align 8
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
+ invoke void @crash() #5
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 ()* @"filt$main" to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad
+ %3 = load i32, i32* %__exceptioncode, align 4
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4
+ br label %__try.cont
+
+__try.cont: ; preds = %entry, %__except
+ ret i32 0
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+define internal i32 @"filt$main"() {
+entry:
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %__exceptioncode = bitcast i8* %code.i8 to i32*
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ store i32 %3, i32* %__exceptioncode, align 4
+ ret i32 1
+}
+
+; Check that we can get the exception code from eax to the printf.
+
+; CHECK-LABEL: _main:
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
+; CHECK: movl %esp, [[reg_offs]](%esi)
+; CHECK: movl $L__ehtable$main,
+; EH state 0
+; CHECK: movl $0, 40(%esi)
+; CHECK: calll _crash
+; CHECK: retl
+; CHECK: # Block address taken
+; stackrestore
+; CHECK: movl -24(%ebp), %esp
+; CHECK: movl $Lmain$parent_frame_offset, %eax
+; CHECK: negl %eax
+; CHECK: leal -24(%ebp,%eax), %esi
+; CHECK: movl 12(%esi), %ebp # 4-byte Reload
+; EH state -1
+; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
+; CHECK: movl $-1, 40(%esi)
+; CHECK-DAG: movl %[[code]], 4(%esp)
+; CHECK-DAG: movl $_str, (%esp)
+; CHECK: calll _printf
+
+; CHECK: .section .xdata,"dr"
+; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: L__ehtable$main
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .long _filt$main
+; CHECK-NEXT: .long Ltmp{{[0-9]+}}
+
+; CHECK-LABEL: _filt$main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: movl (%ebp), %[[oldebp:[a-z]+]]
+; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]]
+; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]]
+; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]]
+; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}})
diff --git a/test/CodeGen/X86/seh-stack-realign.ll b/test/CodeGen/X86/seh-stack-realign.ll
new file mode 100644
index 000000000000..f2fb28a081f9
--- /dev/null
+++ b/test/CodeGen/X86/seh-stack-realign.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+
+; 32-bit catch-all has to use a filter function because that's how it saves the
+; exception code.
+
+@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1
+
+declare i32 @_except_handler3(...)
+declare void @crash()
+declare i32 @printf(i8* nocapture readonly, ...) nounwind
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ ; The EH code allocation is overaligned, triggering realignment.
+ %__exceptioncode = alloca i32, align 8
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
+ invoke void @crash() #5
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 ()* @"filt$main" to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad
+ %3 = load i32, i32* %__exceptioncode, align 4
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4
+ br label %__try.cont
+
+__try.cont: ; preds = %entry, %__except
+ ret i32 0
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+define internal i32 @"filt$main"() {
+entry:
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %__exceptioncode = bitcast i8* %code.i8 to i32*
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ store i32 %3, i32* %__exceptioncode, align 4
+ ret i32 1
+}
+
+; Check that we can get the exception code from eax to the printf.
+
+; CHECK-LABEL: _main:
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
+; CHECK: movl %esp, [[reg_offs]](%esi)
+; CHECK: movl $L__ehtable$main,
+; EH state 0
+; CHECK: movl $0, 40(%esi)
+; CHECK: calll _crash
+; CHECK: retl
+; CHECK: # Block address taken
+; Restore ESP
+; CHECK: movl -24(%ebp), %esp
+; Restore ESI
+; CHECK: movl $Lmain$parent_frame_offset, %eax
+; CHECK: negl %eax
+; CHECK: leal -24(%ebp,%eax), %esi
+; Restore EBP
+; CHECK: movl 12(%esi), %ebp # 4-byte Reload
+; EH state -1
+; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
+; CHECK: movl $-1, 40(%esi)
+; CHECK-DAG: movl %[[code]], 4(%esp)
+; CHECK-DAG: movl $_str, (%esp)
+; CHECK: calll _printf
+
+; CHECK: .section .xdata,"dr"
+; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: L__ehtable$main
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .long _filt$main
+; CHECK-NEXT: .long Ltmp{{[0-9]+}}
+
+; CHECK-LABEL: _filt$main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: movl (%ebp), %[[oldebp:[a-z]+]]
+; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]]
+; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]]
+; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]]
+; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}})
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 0f8d9f4d713f..9b851db8121c 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -34,11 +34,11 @@ define float @ff(float %f) #0 {
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3
+; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2
; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm1
; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
@@ -78,7 +78,7 @@ define float @reciprocal_square_root(float %x) #0 {
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index 7c8d5e578898..45028cf4bd37 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s
; SSE2 Logical Shift Left
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index c1cd91beaf53..398675276c66 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -1,6 +1,6 @@
; These are tests for SSE3 codegen.
-; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64
; Test for v8xi16 lowering where we extract the first element of the vector and
; placed it in the second element of the result.
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index c7c1fc946386..63aa742bdf01 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1409,12 +1409,26 @@ define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-; TODO stack_fold_roundsd
+define double @stack_fold_roundsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundsd
+ ;CHECK: vroundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.floor.f64(double %a0)
+ ret double %2
+}
+declare double @llvm.floor.f64(double) nounwind readnone
; TODO stack_fold_roundsd_int
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-; TODO stack_fold_roundss
+define float @stack_fold_roundss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundss
+ ;CHECK: vroundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.floor.f32(float %a0)
+ ret float %2
+}
+declare float @llvm.floor.f32(float) nounwind readnone
; TODO stack_fold_roundss_int
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 63acf5f4f96f..f9fcbaabdebb 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -884,11 +884,29 @@ define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
-; TODO stack_fold_roundsd
+define double @stack_fold_roundsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundsd
+ ;CHECK: roundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.floor.f64(double %a0)
+ ret double %2
+}
+declare double @llvm.floor.f64(double) nounwind readnone
+
; TODO stack_fold_roundsd_int
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define float @stack_fold_roundss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundss
+ ;CHECK: roundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.floor.f32(float %a0)
+ ret float %2
+}
+declare float @llvm.floor.f32(float) nounwind readnone
-; TODO stack_fold_roundss
; TODO stack_fold_roundss_int
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
; TODO stack_fold_rsqrtps
@@ -938,13 +956,25 @@ define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-; TODO stack_fold_sqrtsd
+define double @stack_fold_sqrtsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_sqrtsd
+ ;CHECK: sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.sqrt.f64(double %a0)
+ ret double %2
+}
declare double @llvm.sqrt.f64(double) nounwind readnone
; TODO stack_fold_sqrtsd_int
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-; TODO stack_fold_sqrtss
+define float @stack_fold_sqrtss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_sqrtss
+ ;CHECK: sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.sqrt.f32(float %a0)
+ ret float %2
+}
declare float @llvm.sqrt.f32(float) nounwind readnone
; TODO stack_fold_sqrtss_int
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 3e72212d85d3..3b1b2f5c1c77 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1,5 +1,10 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -mtriple=i686-unknown-unknown
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
;
; Double to Signed Integer
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index ca8be65075b9..4a3d08813904 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -mtriple=i686-unknown-unknown
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
;
; Signed Integer to Double
@@ -279,36 +284,19 @@ define <2 x double> @uitofp_2vf64_i32(<4 x i32> %a) {
define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
; SSE2-LABEL: uitofp_2vf64_i16:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: subpd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: pand .LCPI10_0(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_2vf64_i16:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: vpand .LCPI10_0(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i16> %shuf to <2 x double>
@@ -318,37 +306,20 @@ define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
define <2 x double> @uitofp_2vf64_i8(<16 x i8> %a) {
; SSE2-LABEL: uitofp_2vf64_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: subpd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pand .LCPI11_0(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_2vf64_i8:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: vpand .LCPI11_0(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i8> %shuf to <2 x double>
@@ -493,34 +464,11 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
; SSE2-LABEL: uitofp_4vf64_i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
-; SSE2-NEXT: addpd %xmm1, %xmm5
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand .LCPI14_2(%rip), %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_4vf64_i16:
@@ -536,38 +484,13 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
define <4 x double> @uitofp_4vf64_i8(<16 x i8> %a) {
; SSE2-LABEL: uitofp_4vf64_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm5
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand .LCPI15_2(%rip), %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_4vf64_i8:
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index ce98e6759b65..47878360ca0a 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -92,3 +92,25 @@ entry:
;CHECK: ret
}
+;CHECK-LABEL: AGEP7:
+define <4 x i8*> @AGEP7(<4 x i8*> %param, i32 %off) nounwind {
+entry:
+;CHECK: vbroadcastss
+;CHECK: vpadd
+ %A = getelementptr i8, <4 x i8*> %param, i32 %off
+ ret <4 x i8*> %A
+;CHECK: ret
+}
+
+;CHECK-LABEL: AGEP8:
+define <4 x i16*> @AGEP8(i16* %param, <4 x i32> %off) nounwind {
+entry:
+; Multiply offset by two (add it to itself).
+;CHECK: vpadd
+; add the base to the offset
+;CHECK: vbroadcastss
+;CHECK-NEXT: vpadd
+ %A = getelementptr i16, i16* %param, <4 x i32> %off
+ ret <4 x i16*> %A
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index aafc05b2ed4c..8e79493ddd07 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -160,14 +160,14 @@ entry:
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test1:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test1:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: retq
@@ -196,7 +196,7 @@ entry:
define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test2:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
@@ -204,7 +204,7 @@ define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
;
; SSSE3-LABEL: load_sext_test2:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
@@ -280,7 +280,7 @@ entry:
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test4:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -290,7 +290,7 @@ define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
;
; SSSE3-LABEL: load_sext_test4:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -322,7 +322,7 @@ entry:
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_test5:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -330,7 +330,7 @@ define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
;
; SSSE3-LABEL: load_sext_test5:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -360,14 +360,14 @@ entry:
define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test6:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test6:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: retq
@@ -463,20 +463,20 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: sext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 4fd2f8b51b8b..61b30154950d 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -10,43 +10,43 @@
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: var_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm1, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm1, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: pextrq $1, %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movd %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v2i64:
; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: sarq %cl, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: sarq %cl, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT: retq
%shift = ashr <2 x i64> %a, %b
@@ -56,73 +56,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad %xmm2, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrad %xmm4, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psrad %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: pextrd $1, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %edx
-; SSE41-NEXT: movd %edx, %xmm2
-; SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm2
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: pextrd $3, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad %xmm2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad %xmm1, %xmm2
+; SSE41-NEXT: psrad %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -136,84 +126,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -221,9 +211,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = ashr <8 x i16> %a, %b
@@ -234,123 +224,123 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, %b
ret <16 x i8> %shift
@@ -363,61 +353,61 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: splatvar_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm2, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm2, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: pextrq $1, %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movd %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: sarq %cl, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: sarq %cl, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: sarq %cl, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: sarq %cl, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: retq
%splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i64> %a, %splat
@@ -453,10 +443,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psraw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -481,160 +471,160 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = ashr <16 x i8> %a, %splat
@@ -648,36 +638,36 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq $7, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: sarq %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: sarq $7, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: sarq %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v2i64:
; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: sarq %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: sarq $7, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: sarq %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shift = ashr <2 x i64> %a, <i64 1, i64 7>
@@ -687,58 +677,43 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: sarl $7, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: sarl $5, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: sarl $4, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: sarl $6, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $6, %xmm2
+; SSE2-NEXT: psrad $4, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: sarl $5, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: sarl $4, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: sarl $6, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: sarl $7, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $5, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $6, %xmm1
+; SSE41-NEXT: psrad $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: sarl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: sarl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: sarl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
@@ -752,56 +727,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -809,9 +784,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -822,126 +797,126 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm3
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $4, %xmm3
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $2, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $1, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <16 x i8> %shift
@@ -954,38 +929,35 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrlq $7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $7, %xmm1
+; SSE41-NEXT: psrlq $7, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
%shift = ashr <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %shift
}
@@ -1021,20 +993,20 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 3fc377af5650..e4642558e0e4 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -63,39 +63,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm2, %edx
-; AVX1-NEXT: vmovd %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -489,32 +480,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; AVX1-NEXT: sarl $9, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl $8, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm1, %eax
-; AVX1-NEXT: sarl $8, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm1, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: sarl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: sarl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: sarl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
@@ -663,41 +642,20 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
+; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
@@ -756,11 +714,11 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index f5a7e28383fe..ca55800e2713 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm3, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: psrlq %xmm3, %xmm2
+; SSE2-NEXT: psrlq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -46,73 +46,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm2, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psrld %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: pextrd $1, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %edx
-; SSE41-NEXT: movd %edx, %xmm2
-; SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm2
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: pextrd $3, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld %xmm1, %xmm2
+; SSE41-NEXT: psrld %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -126,84 +116,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -211,9 +201,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = lshr <8 x i16> %a, %b
@@ -223,72 +213,72 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, %b
@@ -343,10 +333,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -370,99 +360,99 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrlw $4, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = lshr <16 x i8> %a, %splat
@@ -477,24 +467,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $7, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: psrlq $7, %xmm1
+; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $7, %xmm1
-; SSE41-NEXT: psrlq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlq $7, %xmm1
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -509,59 +499,44 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: shrl $7, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: shrl $5, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: shrl $4, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: shrl $6, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $7, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $6, %xmm2
+; SSE2-NEXT: psrld $4, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: shrl $5, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: shrl $4, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: shrl $6, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: shrl $7, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $5, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $6, %xmm1
+; SSE41-NEXT: psrld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
-; AVX1-NEXT: retq
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # BB#0:
@@ -574,56 +549,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -631,9 +606,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -643,72 +618,72 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -764,14 +739,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlw $3, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index d200abd5f875..bb0cceed7720 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -33,39 +33,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm2, %edx
-; AVX1-NEXT: vmovd %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -167,17 +158,17 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, %b
@@ -334,32 +325,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; AVX1-NEXT: shrl $9, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm1, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm1, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
@@ -453,18 +432,18 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -540,8 +519,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlw $3, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 3ac31ea63676..6dbd9eab2a72 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm3, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: psllq %xmm3, %xmm2
+; SSE2-NEXT: psllq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psllq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -46,33 +46,33 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -86,84 +86,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psllw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psllw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -171,9 +171,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = shl <8 x i16> %a, %b
@@ -183,69 +183,69 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, %b
@@ -300,10 +300,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -327,95 +327,95 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psllw $4, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $2, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = shl <16 x i8> %a, %splat
@@ -430,24 +430,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm1
-; SSE2-NEXT: psllq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: psllq $7, %xmm1
+; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $7, %xmm1
-; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $7, %xmm1
+; SSE41-NEXT: psllq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -462,13 +462,13 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
@@ -507,69 +507,69 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: paddb %xmm2, %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -625,14 +625,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $3, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 7c13c0ae4716..b287875f6541 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -193,7 +193,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw $3, %xmm1, %xmm2, %xmm1 # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -341,7 +341,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm4 # xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll
new file mode 100644
index 000000000000..26062335cc16
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -0,0 +1,221 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
+
+;
+; EXTRQI
+;
+
+define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_1zzzuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_12zzuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_012zuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
+; BTVER1-LABEL: shuf_0zzz1zzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzz1zzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
+ ret <8 x i16> %s
+}
+
+define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
+; BTVER1-LABEL: shuf_0z1z:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: pxor %xmm1, %xmm1
+; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0z1z:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+ ret <4 x i32> %s
+}
+
+;
+; INSERTQI
+;
+
+define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0823uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0183uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0128uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0893uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089Auuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089uuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d2eef9af2a25..2480e676cad0 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -223,15 +223,15 @@ entry:
}
define <16 x i8> @trunc16i64_const() {
-; SSE-LABEL: trunc16i64_const
-; SSE: # BB#0: # %entry
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE-LABEL: trunc16i64_const:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
;
-; AVX-LABEL: trunc16i64_const
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX-LABEL: trunc16i64_const:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%0 = trunc <16 x i64> zeroinitializer to <16 x i8>
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index c64e17442675..b119f5eb89f6 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -11,7 +11,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
@@ -20,7 +20,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
@@ -28,7 +28,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
@@ -156,7 +156,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
@@ -165,15 +165,15 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
@@ -195,24 +195,24 @@ entry:
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pand .LCPI4_0(%rip), %xmm1
-; SSE2-NEXT: retq
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: pand .LCPI4_0(%rip), %xmm1
-; SSSE3-NEXT: retq
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
@@ -239,24 +239,24 @@ entry:
define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-LABEL: load_zext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand .LCPI5_0(%rip), %xmm1
-; SSE2-NEXT: retq
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pand .LCPI5_0(%rip), %xmm1
-; SSSE3-NEXT: retq
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
@@ -415,7 +415,7 @@ entry:
define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pand .LCPI9_0(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll
index cf592b1e9f42..298683559054 100644
--- a/test/CodeGen/X86/vector-zmov.ll
+++ b/test/CodeGen/X86/vector-zmov.ll
@@ -5,15 +5,16 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) {
-; SSE-LABEL: load_zmov_4i32_to_0zzz:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movd (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE-LABEL: load_zmov_4i32_to_0zzz:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: load_zmov_4i32_to_0zzz:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
-; AVX-LABEL: load_zmov_4i32_to_0zzz:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovd (%rdi), %xmm0
-; AVX-NEXT: retq
entry:
%X = load <4 x i32>, <4 x i32>* %ptr
%Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -21,15 +22,16 @@ entry:
}
define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) {
-; SSE-LABEL: load_zmov_2i64_to_0z:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movq (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE-LABEL: load_zmov_2i64_to_0z:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: load_zmov_2i64_to_0z:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
-; AVX-LABEL: load_zmov_2i64_to_0z:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovq (%rdi), %xmm0
-; AVX-NEXT: retq
entry:
%X = load <2 x i64>, <2 x i64>* %ptr
%Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/test/CodeGen/X86/visibility.ll b/test/CodeGen/X86/visibility.ll
index 580c3dc9266d..be7fd96f2dd9 100644
--- a/test/CodeGen/X86/visibility.ll
+++ b/test/CodeGen/X86/visibility.ll
@@ -2,13 +2,19 @@
@zed = external hidden constant i32
+define available_externally hidden void @baz() {
+ ret void
+}
+
define hidden void @foo() nounwind {
entry:
call void @bar(i32* @zed)
+ call void @baz()
ret void
}
declare hidden void @bar(i32*)
;CHECK: .hidden zed
+;CHECK: .hidden baz
;CHECK: .hidden bar
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index 0bdb32fcb86e..f368029e4b49 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -3,13 +3,12 @@
; test vector shifts converted to proper SSE2 vector shifts when the shift
; amounts are the same.
-; Note that x86 does have ashr
+; Note that x86 does have ashr
-; shift1a can't use a packed shift
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
entry:
; CHECK-LABEL: shift1a:
-; CHECK: sarl
+; CHECK: psrad $31
%ashr = ashr <2 x i64> %val, < i64 32, i64 32 >
store <2 x i64> %ashr, <2 x i64>* %dst
ret void
diff --git a/test/CodeGen/X86/webkit-jscc.ll b/test/CodeGen/X86/webkit-jscc.ll
new file mode 100644
index 000000000000..a58c53e024ec
--- /dev/null
+++ b/test/CodeGen/X86/webkit-jscc.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-gnu -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-msvc -mcpu=corei7 < %s | FileCheck %s
+
+define webkit_jscc i32 @simple_jscall(i32 %a, i32 %b, i32 %c) {
+ %ab = add i32 %a, %b
+ %abc = add i32 %ab, %c
+ ret i32 %abc
+}
+
+; 32-bit integers are only aligned to 4 bytes, even on x64. They are *not*
+; promoted to i64.
+
+; CHECK: simple_jscall:
+; CHECK: addl 8(%rsp), %eax
+; CHECK-NEXT: addl 12(%rsp), %eax
+; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index 906f7cdafb95..c8646c6489a1 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,8 +1,9 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: {{cwtl|movswl}}
-; CHECK: {{cwtl|movswl}}
+; CHECK: psllq $48, %xmm0
+; CHECK: psrad $16, %xmm0
+; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; sign extension v2i32 to v2i16
+; sign extension v2i16 to v2i32
define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
entry:
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index f5ddc0eacc61..6f1bd7541231 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -194,17 +194,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]]
; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R0]]x
-; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R1]]x
-; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
-; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R0]]x
-; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
-; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]]
+; CHECK-NEXT: psrld $1, %[[X1]]
+; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]
; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X1]]
; CHECK-NEXT: pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll
index f235d2884d03..3ee4723ce5f3 100644
--- a/test/CodeGen/X86/win32-eh.ll
+++ b/test/CodeGen/X86/win32-eh.ll
@@ -32,16 +32,19 @@ eh.resume:
; CHECK-LABEL: _use_except_handler3:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
+; CHECK: pushl %ebx
+; CHECK: pushl %edi
+; CHECK: pushl %esi
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl $-1, -4(%ebp)
-; CHECK: movl $L__ehtable$use_except_handler3, -8(%ebp)
-; CHECK: leal -16(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler3, -12(%ebp)
+; CHECK: movl $-1, -16(%ebp)
+; CHECK: movl $L__ehtable$use_except_handler3, -20(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler3, -24(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -16(%ebp)
+; CHECK: movl %[[next]], -28(%ebp)
; CHECK: movl %[[node]], %fs:0
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -16(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
@@ -72,18 +75,18 @@ eh.resume:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -24(%ebp)
-; CHECK: movl $-2, -4(%ebp)
+; CHECK: movl %esp, -36(%ebp)
+; CHECK: movl $-2, -16(%ebp)
; CHECK: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]]
; CHECK: xorl ___security_cookie, %[[lsda]]
-; CHECK: movl %[[lsda]], -8(%ebp)
-; CHECK: leal -16(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler4, -12(%ebp)
+; CHECK: movl %[[lsda]], -20(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler4, -24(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -16(%ebp)
+; CHECK: movl %[[next]], -28(%ebp)
; CHECK: movl %[[node]], %fs:0
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -16(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
@@ -115,20 +118,21 @@ catchall:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -16(%ebp)
-; CHECK: movl $-1, -4(%ebp)
-; CHECK: leal -12(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -8(%ebp)
+; CHECK: movl %esp, -28(%ebp)
+; CHECK: movl $-1, -16(%ebp)
+; CHECK: leal -24(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -12(%ebp)
+; CHECK: movl %[[next]], -24(%ebp)
; CHECK: movl %[[node]], %fs:0
-; CHECK: movl $0, -4(%ebp)
+; CHECK: movl $0, -16(%ebp)
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -12(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -24(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
; CHECK: .section .xdata,"dr"
+; CHECK: .align 4
; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll
index 2c62f4918a7f..477b3144d9e7 100644
--- a/test/CodeGen/X86/win64_frame.ll
+++ b/test/CodeGen/X86/win64_frame.ll
@@ -100,8 +100,9 @@ define i32 @f8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"="
alloca i32, i32 %a
; CHECK: movl %ecx, %eax
- ; CHECK: leaq 15(,%rax,4), %rax
- ; CHECK: andq $-16, %rax
+ ; CHECK: leaq 15(,%rax,4), %rcx
+ ; CHECK: movabsq $34359738352, %rax
+ ; CHECK: andq %rcx, %rax
; CHECK: callq __chkstk
; CHECK: subq %rax, %rsp
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 5848eddf4375..8c91335d91a2 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -598,3 +598,42 @@ if.then.60: ; preds = %if.end.55
cleanup: ; preds = %if.then.60, %if.end.55, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %if.end, %entry
ret void
}
+
+; Make sure we do not insert unreachable code after noreturn function.
+; Although this is not incorrect to insert such code, it is useless
+; and it hurts the binary size.
+;
+; CHECK-LABEL: noreturn:
+; DISABLE: pushq
+;
+; CHECK: testb %dil, %dil
+; CHECK-NEXT: jne [[ABORT:LBB[0-9_]+]]
+;
+; CHECK: movl $42, %eax
+;
+; DISABLE-NEXT: popq
+;
+; CHECK-NEXT: retq
+;
+; CHECK: [[ABORT]]: ## %if.abort
+;
+; ENABLE: pushq
+;
+; CHECK: callq _abort
+; ENABLE-NOT: popq
+define i32 @noreturn(i8 signext %bad_thing) {
+entry:
+ %tobool = icmp eq i8 %bad_thing, 0
+ br i1 %tobool, label %if.end, label %if.abort
+
+if.abort:
+ tail call void @abort() #0
+ unreachable
+
+if.end:
+ ret i32 42
+}
+
+declare void @abort() #0
+
+attributes #0 = { noreturn nounwind }