summaryrefslogtreecommitdiff
path: root/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-pr32733.mir65
-rw-r--r--test/CodeGen/AArch64/arm64-vmul.ll16
-rw-r--r--test/CodeGen/AArch64/fence-singlethread.ll21
-rw-r--r--test/CodeGen/AArch64/optimize-imm.ll64
-rw-r--r--test/CodeGen/AArch64/swiftself-scavenger.ll82
-rw-r--r--test/CodeGen/AMDGPU/add.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast.ll9
-rw-r--r--test/CodeGen/AMDGPU/ashr.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/code-object-metadata-images.ll80
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize.f16.ll2
-rw-r--r--test/CodeGen/AMDGPU/fdiv.ll18
-rw-r--r--test/CodeGen/AMDGPU/fence-amdgiz.ll15
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.v2f16.ll16
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.f16.ll2
-rw-r--r--test/CodeGen/AMDGPU/immv216.ll2
-rw-r--r--test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll9
-rw-r--r--test/CodeGen/AMDGPU/loop_break.ll2
-rw-r--r--test/CodeGen/AMDGPU/lshr.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/merge-m0.mir132
-rw-r--r--test/CodeGen/AMDGPU/mubuf-offset-private.ll136
-rw-r--r--test/CodeGen/AMDGPU/multi-divergent-exit-region.ll180
-rw-r--r--test/CodeGen/AMDGPU/nested-loop-conditions.ll23
-rw-r--r--test/CodeGen/AMDGPU/private-access-no-objects.ll16
-rw-r--r--test/CodeGen/AMDGPU/readcyclecounter.ll14
-rw-r--r--test/CodeGen/AMDGPU/ret_jump.ll2
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg.ll2
-rw-r--r--test/CodeGen/AMDGPU/shl.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/sminmax.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/spill-m0.ll22
-rw-r--r--test/CodeGen/AMDGPU/sub.v2i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/trap.ll21
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir127
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll68
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-isel.ll32
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir230
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir79
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-unsupported.ll80
-rw-r--r--test/CodeGen/ARM/bool-ext-inc.ll39
-rw-r--r--test/CodeGen/ARM/fence-singlethread.ll16
-rw-r--r--test/CodeGen/ARM/v6m-smul-with-overflow.ll16
-rw-r--r--test/CodeGen/ARM/vpadd.ll20
-rw-r--r--test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir35
-rw-r--r--test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir29
-rw-r--r--test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir35
-rw-r--r--test/CodeGen/MSP430/select-use-sr.ll21
-rw-r--r--test/CodeGen/Mips/llvm-ir/mul.ll2
-rw-r--r--test/CodeGen/Mips/llvm-ir/sdiv.ll12
-rw-r--r--test/CodeGen/Mips/llvm-ir/srem.ll11
-rw-r--r--test/CodeGen/Mips/llvm-ir/udiv.ll11
-rw-r--r--test/CodeGen/Mips/llvm-ir/urem.ll6
-rw-r--r--test/CodeGen/Mips/micromips-gp-rc.ll2
-rw-r--r--test/CodeGen/Mips/mips64fpldst.ll12
-rw-r--r--test/CodeGen/Mips/tailcall/tailcall.ll4
-rw-r--r--test/CodeGen/PowerPC/empty-functions.ll6
-rw-r--r--test/CodeGen/SPARC/empty-functions.ll10
-rw-r--r--test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll229
-rw-r--r--test/CodeGen/Thumb/long.ll31
-rw-r--r--test/CodeGen/Thumb/optionaldef-scheduling.ll18
-rw-r--r--test/CodeGen/X86/GlobalISel/callingconv.ll133
-rw-r--r--test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll30
-rw-r--r--test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll8
-rw-r--r--test/CodeGen/X86/GlobalISel/memop.ll4
-rw-r--r--test/CodeGen/X86/asm-reg-type-mismatch.ll2
-rw-r--r--test/CodeGen/X86/atomic-non-integer.ll8
-rw-r--r--test/CodeGen/X86/avx-schedule.ll2840
-rw-r--r--test/CodeGen/X86/bitcast2.ll2
-rw-r--r--test/CodeGen/X86/bool-ext-inc.ll88
-rw-r--r--test/CodeGen/X86/clear_upper_vector_element_bits.ll8
-rw-r--r--test/CodeGen/X86/combine-srl.ll9
-rw-r--r--test/CodeGen/X86/combine-udiv.ll47
-rw-r--r--test/CodeGen/X86/combine-urem.ll93
-rw-r--r--test/CodeGen/X86/constant-hoisting-bfi.ll115
-rw-r--r--test/CodeGen/X86/dagcombine-cse.ll2
-rw-r--r--test/CodeGen/X86/dwarf-headers.ll8
-rw-r--r--test/CodeGen/X86/eh-frame-unreachable.ll11
-rw-r--r--test/CodeGen/X86/empty-function.ll22
-rw-r--r--test/CodeGen/X86/empty-functions.ll8
-rw-r--r--test/CodeGen/X86/extractelement-index.ll8
-rw-r--r--test/CodeGen/X86/fold-tied-op.ll1
-rw-r--r--test/CodeGen/X86/gather-addresses.ll4
-rw-r--r--test/CodeGen/X86/i256-add.ll49
-rw-r--r--test/CodeGen/X86/i64-to-float.ll4
-rw-r--r--test/CodeGen/X86/insertelement-duplicates.ll58
-rw-r--r--test/CodeGen/X86/isint.ll8
-rw-r--r--test/CodeGen/X86/lower-bitcast.ll16
-rw-r--r--test/CodeGen/X86/memcpy-struct-by-value.ll48
-rw-r--r--test/CodeGen/X86/merge_store.ll11
-rw-r--r--test/CodeGen/X86/mmx-bitcast.ll2
-rw-r--r--test/CodeGen/X86/mmx-cvt.ll2
-rw-r--r--test/CodeGen/X86/mod128.ll2
-rw-r--r--test/CodeGen/X86/movmsk.ll2
-rw-r--r--test/CodeGen/X86/nontemporal-2.ll4
-rw-r--r--test/CodeGen/X86/post-ra-sched-with-debug.mir322
-rw-r--r--test/CodeGen/X86/pr14657.ll325
-rw-r--r--test/CodeGen/X86/pr18344.ll2
-rw-r--r--test/CodeGen/X86/pr21792.ll2
-rw-r--r--test/CodeGen/X86/pr22970.ll47
-rw-r--r--test/CodeGen/X86/pr30511.ll2
-rw-r--r--test/CodeGen/X86/pshufb-mask-comments.ll2
-rw-r--r--test/CodeGen/X86/ret-mmx.ll2
-rw-r--r--test/CodeGen/X86/sad_variations.ll6
-rw-r--r--test/CodeGen/X86/scalar-int-to-fp.ll2
-rw-r--r--test/CodeGen/X86/setcc-combine.ll181
-rw-r--r--test/CodeGen/X86/setcc-wide-types.ll32
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll4
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll4
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel.ll12
-rw-r--r--test/CodeGen/X86/sse2-schedule.ll58
-rw-r--r--test/CodeGen/X86/sse3-schedule.ll455
-rw-r--r--test/CodeGen/X86/sse41-schedule.ll1938
-rw-r--r--test/CodeGen/X86/sse42-schedule.ll477
-rw-r--r--test/CodeGen/X86/ssse3-schedule.ll754
-rw-r--r--test/CodeGen/X86/statepoint-vector.ll2
-rw-r--r--test/CodeGen/X86/tls-pic.ll38
-rw-r--r--test/CodeGen/X86/tls-pie.ll127
-rw-r--r--test/CodeGen/X86/tls.ll246
-rw-r--r--test/CodeGen/X86/vec_fneg.ll4
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll106
-rw-r--r--test/CodeGen/X86/vec_insert-3.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-5.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-mmx.ll2
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll112
-rw-r--r--test/CodeGen/X86/vec_set-8.ll2
-rw-r--r--test/CodeGen/X86/vec_set-C.ll2
-rw-r--r--test/CodeGen/X86/vec_shift7.ll2
-rw-r--r--test/CodeGen/X86/vector-compare-all_of.ll8
-rw-r--r--test/CodeGen/X86/vector-compare-any_of.ll8
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-128.ll28
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-128.ll28
-rw-r--r--test/CodeGen/X86/vector-lzcnt-128.ll4
-rw-r--r--test/CodeGen/X86/vector-pcmp.ll27
-rw-r--r--test/CodeGen/X86/vector-sext.ll68
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v2.ll12
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll10
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx2.ll26
-rw-r--r--test/CodeGen/X86/vector-trunc-math.ll10
-rw-r--r--test/CodeGen/X86/vector-trunc.ll14
-rw-r--r--test/CodeGen/X86/vector-tzcnt-128.ll4
-rw-r--r--test/CodeGen/X86/vmovq.ll2
-rw-r--r--test/CodeGen/X86/vshift-1.ll2
-rw-r--r--test/CodeGen/X86/vshift-2.ll2
-rw-r--r--test/CodeGen/X86/vsplit-and.ll10
-rw-r--r--test/CodeGen/X86/widen_cast-5.ll2
-rw-r--r--test/CodeGen/X86/widen_conv-3.ll4
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll4
147 files changed, 10435 insertions, 932 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir
new file mode 100644
index 0000000000000..96436209451b0
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir
@@ -0,0 +1,65 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ define i32 @main() {
+ entry:
+ ret i32 0
+ }
+
+ declare i32 @printf(i8*, ...)
+...
+---
+# CHECK-LABEL: name: main
+name: main
+alignment: 2
+exposesReturnsTwice: false
+noVRegs: false
+legalized: true
+regBankSelected: true
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+ - { id: 6, class: gpr }
+ - { id: 7, class: gpr }
+ - { id: 8, class: gpr }
+ - { id: 9, class: gpr }
+ - { id: 10, class: gpr }
+ - { id: 11, class: gpr }
+ - { id: 12, class: gpr }
+ - { id: 13, class: gpr }
+ - { id: 14, class: gpr }
+ - { id: 15, class: gpr }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: false
+ hasCalls: true
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+# CHECK: body:
+# CHECK: %1 = COPY %w0
+# CHECK-NOT: %2 = ORNWrr %wzr, %1
+# CHECK: %4 = EONWrr %1, %3
+body: |
+ bb.1.entry:
+ liveins: %w0
+ %0(s32) = G_CONSTANT i32 -1
+ %3(s32) = G_CONSTANT i32 1
+ %1(s32) = COPY %w0
+ %2(s32) = G_XOR %1, %0
+ %4(s32) = G_XOR %2, %3
+ %w0 = COPY %4(s32)
+...
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll
index a5fa78abb92f4..a7668ec97979c 100644
--- a/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1201,35 +1201,35 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
; Scalar FMULX
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
-; CHECKNEXT: fmulx s0, s0, s1
+; CHECK-NEXT: fmulx s0, s0, s1
%fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
-; CHECKNEXT: ret
+; CHECK-NEXT: ret
ret float %fmulx.i
}
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
-; CHECKNEXT: fmulx d0, d0, d1
+; CHECK-NEXT: fmulx d0, d0, d1
%fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
-; CHECKNEXT: ret
+; CHECK-NEXT: ret
ret double %fmulx.i
}
define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
-; CHECKNEXT: fmulx.s s0, s0, v1[3]
+; CHECK-NEXT: fmulx.s s0, s0, v1[3]
%b = extractelement <4 x float> %vec, i32 3
%fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
-; CHECKNEXT: ret
+; CHECK-NEXT: ret
ret float %fmulx.i
}
define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
-; CHECKNEXT: fmulx d0, d0, v1[1]
+; CHECK-NEXT: fmulx.d d0, d0, v1[1]
%b = extractelement <2 x double> %vec, i32 1
%fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
-; CHECKNEXT: ret
+; CHECK-NEXT: ret
ret double %fmulx.i
}
diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll
new file mode 100644
index 0000000000000..2ed744277385a
--- /dev/null
+++ b/test/CodeGen/AArch64/fence-singlethread.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=LINUX
+; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=IOS
+; RUN: llc -mtriple=aarch64-linux-gnueabihf %s -filetype=obj -o %t
+; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ
+
+; OBJ-NOT: dmb
+
+define void @fence_singlethread() {
+; LINUX-LABEL: fence_singlethread:
+; LINUX-NOT: dmb
+; LINUX: // COMPILER BARRIER
+; LINUX-NOT: dmb
+
+; IOS-LABEL: fence_singlethread:
+; IOS-NOT: dmb
+; IOS: ; COMPILER BARRIER
+; IOS-NOT: dmb
+
+ fence singlethread seq_cst
+ ret void
+}
diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll
new file mode 100644
index 0000000000000..a4725c65aa26f
--- /dev/null
+++ b/test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s
+
+; CHECK-LABEL: and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+ %and = and i32 %a, 253
+ %conv = trunc i32 %and to i8
+ store i8 %conv, i8* %p, align 1
+ ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+ %and = and i32 %a, 15869
+ %or = or i32 %and, -16384
+ ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+ %and = and i32 %a, 25
+ %or = or i32 %and, -64
+ ret i32 %or
+}
+
+; (a & 0xc0600) | 0xfff1f1ff
+;
+; CHECK-LABEL: and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+ %and = and i32 %a, 787968
+ %or = or i32 %and, -921089
+ ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+ %shl = shl i32 %a, 3
+ %xor = and i32 %shl, 56
+ %and = xor i32 %xor, 56
+ ret i32 %and
+}
diff --git a/test/CodeGen/AArch64/swiftself-scavenger.ll b/test/CodeGen/AArch64/swiftself-scavenger.ll
new file mode 100644
index 0000000000000..6d02784409317
--- /dev/null
+++ b/test/CodeGen/AArch64/swiftself-scavenger.ll
@@ -0,0 +1,82 @@
+; RUN: llc -o - %s | FileCheck %s
+; Check that we reserve an emergency spill slot, even if we added an extra
+; CSR spill for the values used by the swiftself parameter.
+; CHECK-LABEL: func:
+; CHECK: str [[REG:x[0-9]+]], [sp, #8]
+; CHECK: add [[REG]], sp, #248
+; CHECK: str xzr, [{{\s*}}[[REG]], #32760]
+; CHECK: ldr x30, [sp, #8]
+target triple = "arm64-apple-ios"
+
+@ptr8 = external global i8*
+@ptr64 = external global i64
+
+define hidden swiftcc void @func(i8* swiftself %arg) #0 {
+bb:
+ %stack0 = alloca i8*, i32 5000, align 8
+ %stack1 = alloca i8*, i32 32, align 8
+
+ %v0 = load volatile i64, i64* @ptr64, align 8
+ %v1 = load volatile i64, i64* @ptr64, align 8
+ %v2 = load volatile i64, i64* @ptr64, align 8
+ %v3 = load volatile i64, i64* @ptr64, align 8
+ %v4 = load volatile i64, i64* @ptr64, align 8
+ %v5 = load volatile i64, i64* @ptr64, align 8
+ %v6 = load volatile i64, i64* @ptr64, align 8
+ %v7 = load volatile i64, i64* @ptr64, align 8
+ %v8 = load volatile i64, i64* @ptr64, align 8
+ %v9 = load volatile i64, i64* @ptr64, align 8
+ %v10 = load volatile i64, i64* @ptr64, align 8
+ %v11 = load volatile i64, i64* @ptr64, align 8
+ %v12 = load volatile i64, i64* @ptr64, align 8
+ %v13 = load volatile i64, i64* @ptr64, align 8
+ %v14 = load volatile i64, i64* @ptr64, align 8
+ %v15 = load volatile i64, i64* @ptr64, align 8
+ %v16 = load volatile i64, i64* @ptr64, align 8
+ %v17 = load volatile i64, i64* @ptr64, align 8
+ %v18 = load volatile i64, i64* @ptr64, align 8
+ %v19 = load volatile i64, i64* @ptr64, align 8
+ %v20 = load volatile i64, i64* @ptr64, align 8
+ %v21 = load volatile i64, i64* @ptr64, align 8
+ %v22 = load volatile i64, i64* @ptr64, align 8
+ %v23 = load volatile i64, i64* @ptr64, align 8
+ %v24 = load volatile i64, i64* @ptr64, align 8
+ %v25 = load volatile i64, i64* @ptr64, align 8
+
+ ; this should exceed stack-relative addressing limits and need an emergency
+ ; spill slot.
+ %s = getelementptr inbounds i8*, i8** %stack0, i64 4092
+ store volatile i8* null, i8** %s
+ store volatile i8* null, i8** %stack1
+
+ store volatile i64 %v0, i64* @ptr64, align 8
+ store volatile i64 %v1, i64* @ptr64, align 8
+ store volatile i64 %v2, i64* @ptr64, align 8
+ store volatile i64 %v3, i64* @ptr64, align 8
+ store volatile i64 %v4, i64* @ptr64, align 8
+ store volatile i64 %v5, i64* @ptr64, align 8
+ store volatile i64 %v6, i64* @ptr64, align 8
+ store volatile i64 %v7, i64* @ptr64, align 8
+ store volatile i64 %v8, i64* @ptr64, align 8
+ store volatile i64 %v9, i64* @ptr64, align 8
+ store volatile i64 %v10, i64* @ptr64, align 8
+ store volatile i64 %v11, i64* @ptr64, align 8
+ store volatile i64 %v12, i64* @ptr64, align 8
+ store volatile i64 %v13, i64* @ptr64, align 8
+ store volatile i64 %v14, i64* @ptr64, align 8
+ store volatile i64 %v15, i64* @ptr64, align 8
+ store volatile i64 %v16, i64* @ptr64, align 8
+ store volatile i64 %v17, i64* @ptr64, align 8
+ store volatile i64 %v18, i64* @ptr64, align 8
+ store volatile i64 %v19, i64* @ptr64, align 8
+ store volatile i64 %v20, i64* @ptr64, align 8
+ store volatile i64 %v21, i64* @ptr64, align 8
+ store volatile i64 %v22, i64* @ptr64, align 8
+ store volatile i64 %v23, i64* @ptr64, align 8
+ store volatile i64 %v24, i64* @ptr64, align 8
+ store volatile i64 %v25, i64* @ptr64, align 8
+
+ ; use swiftself parameter late so it stays alive throughout the function.
+ store volatile i8* %arg, i8** @ptr8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index e137ef4bc2367..73e80d523f1e2 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 6ec93c72ec527..b1e71722d80c5 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
@@ -223,9 +223,8 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
}
; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
store volatile i32 7, i32* %cast
diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 96a5e3b23758a..7f424ef2a1477 100644
--- a/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-images.ll b/test/CodeGen/AMDGPU/code-object-metadata-images.ll
new file mode 100644
index 0000000000000..918560469852b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-images.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+%opencl.image1d_t = type opaque
+%opencl.image1d_array_t = type opaque
+%opencl.image1d_buffer_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image2d_array_t = type opaque
+%opencl.image2d_array_depth_t = type opaque
+%opencl.image2d_array_msaa_t = type opaque
+%opencl.image2d_array_msaa_depth_t = type opaque
+%opencl.image2d_depth_t = type opaque
+%opencl.image2d_msaa_t = type opaque
+%opencl.image2d_msaa_depth_t = type opaque
+%opencl.image3d_t = type opaque
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+
+; CHECK: Kernels:
+; CHECK: - Name: test
+; CHECK: Args:
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image1d_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image1d_array_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image1d_buffer_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_array_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_array_depth_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_array_msaa_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_array_msaa_depth_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_depth_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_msaa_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image2d_msaa_depth_t
+; CHECK: - Size: 8
+; CHECK: ValueKind: Image
+; CHECK: TypeName: image3d_t
+define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a,
+ %opencl.image1d_array_t addrspace(1)* %b,
+ %opencl.image1d_buffer_t addrspace(1)* %c,
+ %opencl.image2d_t addrspace(1)* %d,
+ %opencl.image2d_array_t addrspace(1)* %e,
+ %opencl.image2d_array_depth_t addrspace(1)* %f,
+ %opencl.image2d_array_msaa_t addrspace(1)* %g,
+ %opencl.image2d_array_msaa_depth_t addrspace(1)* %h,
+ %opencl.image2d_depth_t addrspace(1)* %i,
+ %opencl.image2d_msaa_t addrspace(1)* %j,
+ %opencl.image2d_msaa_depth_t addrspace(1)* %k,
+ %opencl.image3d_t addrspace(1)* %l)
+ !kernel_arg_type !1 !kernel_arg_base_type !1 {
+ ret void
+}
+
+!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t",
+ !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t",
+ !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t",
+ !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t",
+ !"image3d_t"}
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index f2686a5582dc6..c9787bb478ef2 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index b3a2b66437207..738a5adba14fb 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -85,10 +85,20 @@ entry:
}
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; GCN-NOT: s_setreg
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN-NOT: s_setreg
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv fast float %a, %b
diff --git a/test/CodeGen/AMDGPU/fence-amdgiz.ll b/test/CodeGen/AMDGPU/fence-amdgiz.ll
new file mode 100644
index 0000000000000..df675c9a8692e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fence-amdgiz.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn-amd-amdhsa-amdgizcl"
+
+; CHECK-LABEL: atomic_fence
+; CHECK: BB#0:
+; CHECK: ATOMIC_FENCE 4, 1
+; CHECK: s_endpgm
+
+define amdgpu_kernel void @atomic_fence() {
+ fence acquire
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index bdd3c04fd3189..624610096cbc5 100644
--- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 555764c15519e..506b2a02f8281 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
; CI: v_cvt_f32_f16_e32
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index 85ad365d02a89..c15a30e3c5401 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; FIXME: Merge into imm.ll
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index a3f82b8a01174..89adcff1a2787 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -216,7 +216,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
-; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000{{$}}
; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 5e892fad3741b..cbd8f0a9c23a3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s
ret void
}
+; CHECK-LABEL: {{^}}test_readlane_vregs:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 { ; readlane where both the value and the lane index are loaded per work item
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() ; work-item id, used only to index %in
+ %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+ %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in ; one <value, lane> pair per work item
+ %value = extractelement <2 x i32> %args, i32 0 ; element 0 -> first readlane operand
+ %lane = extractelement <2 x i32> %args, i32 1 ; element 1 -> second (lane) readlane operand
+ %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane) ; the check lines above expect the lane to pass through v_readfirstlane_b32 into an SGPR before v_readlane_b32 uses it
+ store i32 %readlane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
; TODO: m0 should be folded.
; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
; CHECK: s_mov_b32 m0, -1
@@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0)
ret void
}
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll
new file mode 100644
index 0000000000000..bafafa33016fa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march amdgcn %s -filetype=obj -o /dev/null
+; RUN: llc -march amdgcn <%s | FileCheck %s
+define amdgpu_kernel void @f() { ; minimal kernel: the intrinsic call is its only non-return instruction
+ ; CHECK: ; divergent unreachable
+ call void @llvm.amdgcn.unreachable() ; intrinsic under test; the directive above expects the text "; divergent unreachable" in llc's output
+ ret void
+}
+
+declare void @llvm.amdgcn.unreachable()
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index 84c42e8bd1e06..b9df2cb779ad0 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -10,7 +10,7 @@
; OPT: bb4:
; OPT: load volatile
-; OPT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT: xor i1 %cmp1
; OPT: call i64 @llvm.amdgcn.if.break(
; OPT: br label %Flow
diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll
index e21d0d09bb415..6a90a7a9f2eb3 100644
--- a/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir
new file mode 100644
index 0000000000000..064db49924e15
--- /dev/null
+++ b/test/CodeGen/AMDGPU/merge-m0.mir
@@ -0,0 +1,132 @@
+# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN: bb.0.entry:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.1:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.2:
+# GCN: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.3:
+# GCN: SI_INIT_M0 3
+
+# GCN: bb.4:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.5:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.6:
+# GCN: SI_INIT_M0 -1,
+# GCN-NEXT: DS_WRITE_B32
+# GCN: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+
+---
+name: test # single function run through si-fix-sgpr-copies with -amdgpu-enable-merge-m0 (see RUN line)
+alignment: 0
+exposesReturnsTwice: false
+noVRegs: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 } # %0: operand of every DS_WRITE_B32 below
+ - { id: 1, class: vgpr_32 } # %1: operand of every DS_WRITE_B32 below
+ - { id: 2, class: sreg_32_xm0 } # %2: register (non-immediate) SI_INIT_M0 operand used in bb.6
+body: |
+ bb.0.entry:
+ successors: %bb.1, %bb.2
+
+ %0 = IMPLICIT_DEF
+ %1 = IMPLICIT_DEF
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0 ; same immediate as the previous init: the bb.0 checks expect two DS_WRITE_B32 after a single SI_INIT_M0 -1
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0 ; value changes, so the checks expect this init to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0 ; repeated immediate: expected to be dropped per the bb.0 checks
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0 ; value changes again: expected to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0 ; value changes again: expected to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+ SI_INIT_M0 -1, implicit-def %m0 ; the bb.1 checks expect exactly this one init followed by both writes
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0 ; repeated immediate within the block: expected to be dropped
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ SI_INIT_M0 65536, implicit-def %m0 ; expected to remain (bb.2 checks)
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4, %bb.5
+ S_CBRANCH_VCCZ %bb.4, implicit undef %vcc ; no M0 init in the input here; the bb.3 checks expect an SI_INIT_M0 3 to appear in this block
+ S_BRANCH %bb.5
+
+ bb.4:
+ successors: %bb.6
+ SI_INIT_M0 3, implicit-def %m0 ; the bb.4 checks expect no SI_INIT_M0 ahead of the first DS_WRITE_B32
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 4, implicit-def %m0 ; different immediate: expected to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.6
+
+ bb.5:
+ successors: %bb.6
+ SI_INIT_M0 3, implicit-def %m0 ; same shape as bb.4: the bb.5 checks expect no SI_INIT_M0 ahead of the first DS_WRITE_B32
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 4, implicit-def %m0 ; different immediate: expected to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.6
+
+ bb.6:
+ successors: %bb.0.entry, %bb.6
+ SI_INIT_M0 -1, implicit-def %m0 ; expected to remain (bb.6 checks); this block branches back to itself and to the entry block
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ %2 = IMPLICIT_DEF
+ SI_INIT_M0 %2, implicit-def %m0 ; register operand instead of an immediate
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 %2, implicit-def %m0 ; same register again, yet the bb.6 checks still expect this second init to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0 ; immediate after register inits: expected to remain
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_CBRANCH_VCCZ %bb.6, implicit undef %vcc
+ S_BRANCH %bb.0.entry
+
+...
diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll
new file mode 100644
index 0000000000000..3a0605fa182a3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -0,0 +1,136 @@
+; RUN: llc -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; Test addressing modes when the scratch base is not a frame index.
+
+; GCN-LABEL: {{^}}store_private_offset_i8:
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @store_private_offset_i8() #0 { ; byte store to a constant private address
+ store volatile i8 5, i8* inttoptr (i32 8 to i8*) ; address is the literal 8 via inttoptr (no alloca); the check above expects it as "off ... offset:8"
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_i16:
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @store_private_offset_i16() #0 { ; 16-bit variant of the constant-address store
+ store volatile i16 5, i16* inttoptr (i32 8 to i16*) ; expected to select buffer_store_short with offset:8 (check above)
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_i32:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @store_private_offset_i32() #0 { ; 32-bit variant of the constant-address store
+ store volatile i32 5, i32* inttoptr (i32 8 to i32*) ; expected to select buffer_store_dword with offset:8 (check above)
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_v2i32:
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @store_private_offset_v2i32() #0 { ; 2 x i32 vector store to constant address 8
+ store volatile <2 x i32> <i32 5, i32 10>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ; expected as a single buffer_store_dwordx2 with offset:8 (check above)
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_v4i32:
+; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @store_private_offset_v4i32() #0 { ; 4 x i32 vector store to constant address 8
+ store volatile <4 x i32> <i32 5, i32 10, i32 15, i32 0>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ; expected as a single buffer_store_dwordx4 with offset:8 (check above); the RUN lines pass +max-private-element-size-16
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_offset_i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @load_private_offset_i8() #0 { ; byte load from constant private address 8
+ %load = load volatile i8, i8* inttoptr (i32 8 to i8*) ; result is unused; the access is marked volatile, presumably so it is not removed; check above expects buffer_load_ubyte
+ ret void
+}
+
+; GCN-LABEL: {{^}}sextload_private_offset_i8:
+; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { ; byte load followed by sign extension
+ %load = load volatile i8, i8* inttoptr (i32 8 to i8*)
+ %sextload = sext i8 %load to i32 ; the check above expects load+sext to become buffer_load_sbyte
+ store i32 %sextload, i32 addrspace(1)* undef ; NOTE(review): %out is unused; the result is stored through an undef pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}zextload_private_offset_i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { ; byte load followed by zero extension
+ %load = load volatile i8, i8* inttoptr (i32 8 to i8*)
+ %zextload = zext i8 %load to i32 ; the check above expects load+zext to become buffer_load_ubyte
+ store i32 %zextload, i32 addrspace(1)* undef ; NOTE(review): %out is unused; the result is stored through an undef pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_offset_i16:
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @load_private_offset_i16() #0 { ; 16-bit load from constant private address 8
+ %load = load volatile i16, i16* inttoptr (i32 8 to i16*) ; result is unused; check above expects buffer_load_ushort with offset:8
+ ret void
+}
+
+; GCN-LABEL: {{^}}sextload_private_offset_i16:
+; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { ; 16-bit load followed by sign extension
+ %load = load volatile i16, i16* inttoptr (i32 8 to i16*)
+ %sextload = sext i16 %load to i32 ; the check above expects load+sext to become buffer_load_sshort
+ store i32 %sextload, i32 addrspace(1)* undef ; NOTE(review): %out is unused; the result is stored through an undef pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}zextload_private_offset_i16:
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { ; 16-bit load followed by zero extension
+ %load = load volatile i16, i16* inttoptr (i32 8 to i16*)
+ %zextload = zext i16 %load to i32 ; the check above expects load+zext to become buffer_load_ushort
+ store i32 %zextload, i32 addrspace(1)* undef ; NOTE(review): %out is unused; the result is stored through an undef pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_offset_i32:
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @load_private_offset_i32() #0 { ; 32-bit load from constant private address 8
+ %load = load volatile i32, i32* inttoptr (i32 8 to i32*) ; result is unused; check above expects buffer_load_dword with offset:8
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_offset_v2i32:
+; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @load_private_offset_v2i32() #0 { ; 2 x i32 vector load from constant private address 8
+ %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ; result is unused; check above expects one buffer_load_dwordx2 with offset:8
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_offset_v4i32:
+; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+define amdgpu_kernel void @load_private_offset_v4i32() #0 { ; 4 x i32 vector load from constant private address 8
+ %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ; result is unused; check above expects one buffer_load_dwordx4 with offset:8
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_i8_max_offset:
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095
+define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; boundary case: per the function name, the largest address still encoded as an immediate offset
+ store volatile i8 5, i8* inttoptr (i32 4095 to i8*) ; check above expects "off ... offset:4095", i.e. no address VGPR
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1:
+; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}}
+define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; boundary case: one past the max_offset test
+ store volatile i8 5, i8* inttoptr (i32 4096 to i8*) ; 4096 = 0x1000: checks above expect the address in a VGPR (v_mov_b32 0x1000) used with "offen" and no immediate offset
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2:
+; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}}
+define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; boundary case: two past the max_offset test
+ store volatile i8 5, i8* inttoptr (i32 4097 to i8*) ; 4097 = 0x1000 + 1: checks above expect the split v_mov_b32 0x1000 plus "offen offset:1"
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 4bd8bff4809af..9d0b6b395996b 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -9,19 +9,18 @@
; StructurizeCFG.
; IR-LABEL: @multi_divergent_region_exit_ret_ret(
-; IR: %Pivot = icmp sge i32 %tmp16, 2
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
-; IR: %1 = extractvalue { i1, i64 } %0, 0
-; IR: %2 = extractvalue { i1, i64 } %0, 1
-; IR: br i1 %1, label %LeafBlock1, label %Flow
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
; IR: Flow:
-; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
-; IR: %6 = extractvalue { i1, i64 } %5, 0
-; IR: %7 = extractvalue { i1, i64 } %5, 1
-; IR: br i1 %6, label %LeafBlock, label %Flow1
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
; IR: LeafBlock:
; IR: br label %Flow1
@@ -30,32 +29,32 @@
; IR: br label %Flow{{$}}
; IR: Flow2:
-; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %16)
-; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
-; IR: %10 = extractvalue { i1, i64 } [[IF]], 0
-; IR: %11 = extractvalue { i1, i64 } [[IF]], 1
-; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock
; IR: Flow1:
-; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
-; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
-; IR: %15 = extractvalue { i1, i64 } %14, 0
-; IR: %16 = extractvalue { i1, i64 } %14, 1
-; IR: br i1 %15, label %exit1, label %Flow2
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2
; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
@@ -65,9 +64,11 @@
; GCN: s_xor_b64
-; GCN: ; %LeafBlock
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
; GCN: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec
@@ -125,15 +126,14 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
}
; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
-; IR: %Pivot = icmp sge i32 %tmp16, 2
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
-; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
-; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
; IR: UnifiedUnreachableBlock:
@@ -181,49 +181,51 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
}
; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
-; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1
; IR: {{^}}Flow:
-; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
-; IR: br i1 %6, label %LeafBlock, label %Flow1
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
; IR: {{^}}LeafBlock:
-; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1
; IR: LeafBlock1:
-; IR: %uniform.cond0 = icmp ne i32 %arg3, 2
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
; IR: br label %Flow
; IR: Flow2:
-; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
-; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock
; IR: {{^}}Flow1:
-; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
-; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
-; IR: %15 = extractvalue { i1, i64 } %14, 0
-; IR: %16 = extractvalue { i1, i64 } %14, 1
-; IR: br i1 %15, label %exit1, label %Flow2
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2
; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
@@ -262,18 +264,17 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
}
; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
-; IR: %Pivot = icmp sge i32 %tmp16, 2
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
-; IR: br i1 %1, label %LeafBlock1, label %Flow
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
; IR: Flow:
-; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
-; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
@@ -313,13 +314,13 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
; IR: Flow2:
-; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
-; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %17)
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
; IR: UnifiedReturnBlock:
-; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %12)
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
@@ -386,32 +387,31 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
}
; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
-; IR: %Pivot = icmp sge i32 %tmp16, 2
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: Flow:
-; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: Flow2:
-; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
-; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock
; IR: Flow1:
-; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
-; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
-; IR: %15 = extractvalue { i1, i64 } %14, 0
-; IR: %16 = extractvalue { i1, i64 } %14, 1
-; IR: br i1 %15, label %exit1, label %Flow2
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
@@ -419,7 +419,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; IR-NEXT: br label %Flow2
; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
@@ -475,7 +475,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; IR-NEXT: br label %Flow2
; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
@@ -622,15 +622,15 @@ uniform.ret:
; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
-; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
-; IR: br i1 %6, label %uniform.if, label %Flow2
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
; IR: Flow: ; preds = %uniform.then, %uniform.if
-; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ]
-; IR: br i1 %7, label %uniform.endif, label %uniform.ret0
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
; IR-NEXT: ret void
define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index c0b4eaff60aac..672549c8ea636 100644
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -133,9 +133,9 @@ bb23: ; preds = %bb10
; IR: Flow1:
; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ]
+; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
@@ -144,9 +144,9 @@ bb23: ; preds = %bb10
; IR: Flow2:
; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
@@ -156,15 +156,16 @@ bb23: ; preds = %bb10
; IR: bb21:
; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken)
+; IR-NEXT: %27 = xor i1 %tmp12, true
+; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
; IR-NEXT: br label %Flow3
; IR: Flow3:
; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ]
+; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
+; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
+; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
+; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll
index af26835102938..dcb089010e99d 100644
--- a/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=OPTNONE %s
; There are no stack objects, but still a private memory access. The
; private access registers need to be correctly initialized anyway, and
@@ -27,9 +27,9 @@ define amdgpu_kernel void @store_to_undef() #0 {
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
-; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
define amdgpu_kernel void @store_to_inttoptr() #0 {
- store volatile i32 0, i32* inttoptr (i32 123 to i32*)
+ store volatile i32 0, i32* inttoptr (i32 124 to i32*)
ret void
}
@@ -47,9 +47,9 @@ define amdgpu_kernel void @load_from_undef() #0 {
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
-; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
define amdgpu_kernel void @load_from_inttoptr() #0 {
- %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
+ %ld = load volatile i32, i32* inttoptr (i32 124 to i32*)
ret void
}
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
index 5c698c839fa68..d7b353cd25d38 100644
--- a/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -22,4 +22,18 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
ret void
}
+; This test used to crash in ScheduleDAG.
+;
+; GCN-LABEL: {{^}}test_readcyclecounter_smem:
+; SI-DAG: s_memtime
+; VI-DAG: s_memrealtime
+; GCN-DAG: s_load_dword
+define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(2)* inreg %in) #0 {
+ %cycle0 = call i64 @llvm.readcyclecounter()
+ %in.v = load i64, i64 addrspace(2)* %in
+ %r.64 = add i64 %cycle0, %in.v
+ %r.32 = trunc i64 %r.64 to i32
+ ret i32 %r.32
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index 748f98a12c591..f2fbacbab82e7 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -56,7 +56,7 @@ ret.bb: ; preds = %else, %main_body
}
; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
-; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
; GCN: ; BB#{{[0-9]+}}: ; %else
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index b702e1c07200d..160fb6a038fed 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll
index eac29bad7cf23..115221c5316dc 100644
--- a/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 4e093cdece212..16ce86bf8b115 100644
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 0e715c453209e..8f1aebfe9ceb8 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -69,19 +69,20 @@ endif:
; TOSMEM-NOT: s_m0
; TOSMEM: s_add_u32 m0, s7, 0x100
; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s7, 0x200
; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_mov_b64 exec,
; TOSMEM: s_cbranch_execz
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x200
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
@@ -130,7 +131,7 @@ endif: ; preds = %else, %if
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
@@ -159,13 +160,14 @@ endif:
; GCN-LABEL: {{^}}restore_m0_lds:
; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
; TOSMEM: s_cmp_eq_u32
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_cbranch_scc1
; TOSMEM: s_mov_b32 m0, -1
@@ -178,10 +180,10 @@ endif:
; TOSMEM: ds_write_b64
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_waitcnt lgkmcnt(0)
; TOSMEM-NOT: m0
; TOSMEM: s_mov_b32 m0, s0
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
index 69f0accef6282..431344670ffb1 100644
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll
index 77ad895d0e86a..51771c9723e00 100644
--- a/test/CodeGen/AMDGPU/trap.ll
+++ b/test/CodeGen/AMDGPU/trap.ll
@@ -80,4 +80,25 @@ define amdgpu_kernel void @trap() {
ret void
}
+; GCN-LABEL: {{^}}non_entry_trap:
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; The trap call sits in the non-entry block %trap, so match that block label before the trap sequence.
+; HSA: BB{{[0-9]+_[0-9]+}}: ; %trap
+; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-NEXT: s_trap 2
+define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr #1 {
+entry:
+ %tmp29 = load volatile i32, i32 addrspace(1)* %arg0
+ %cmp = icmp eq i32 %tmp29, -1
+ br i1 %cmp, label %ret, label %trap
+
+trap:
+ call void @llvm.trap()
+ unreachable
+
+ret:
+ ret void
+}
+
attributes #0 = { nounwind noreturn }
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 21c774133f896..83ab2659ef4aa 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -5,6 +5,8 @@
define void @test_sext_s8() { ret void }
define void @test_zext_s16() { ret void }
+ define void @test_trunc_s32_16() { ret void }
+
define void @test_add_s8() { ret void }
define void @test_add_s16() { ret void }
define void @test_add_s32() { ret void }
@@ -21,6 +23,9 @@
define void @test_mul_s32() #1 { ret void }
define void @test_mulv5_s32() { ret void }
+ define void @test_sdiv_s32() #2 { ret void }
+ define void @test_udiv_s32() #2 { ret void }
+
define void @test_load_from_stack() { ret void }
define void @test_load_f32() #0 { ret void }
define void @test_load_f64() #0 { ret void }
@@ -28,12 +33,14 @@
define void @test_stores() #0 { ret void }
define void @test_gep() { ret void }
- define void @test_constants() { ret void }
+ define void @test_constant_imm() { ret void }
+ define void @test_constant_cimm() { ret void }
define void @test_soft_fp_double() #0 { ret void }
attributes #0 = { "target-features"="+vfp2,-neonfp" }
attributes #1 = { "target-features"="+v6" }
+ attributes #2 = { "target-features"="+hwdiv-arm" }
...
---
name: test_zext_s1
@@ -142,6 +149,34 @@ body: |
; CHECK: BX_RET 14, _, implicit %r0
...
---
+name: test_trunc_s32_16
+# CHECK-LABEL: name: test_trunc_s32_16
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+# CHECK-DAG: id: 0, class: gpr
+# CHECK-DAG: id: 1, class: gpr
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0
+
+ %1(s16) = G_TRUNC %0(s32)
+ ; CHECK: [[VREGTRUNC:%[0-9]+]] = COPY [[VREGX]]
+
+ %r0 = COPY %1(s16)
+ ; CHECK: %r0 = COPY [[VREGTRUNC]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
name: test_add_s8
# CHECK-LABEL: name: test_add_s8
legalized: true
@@ -538,6 +573,72 @@ body: |
; CHECK: BX_RET 14, _, implicit %r0
...
---
+name: test_sdiv_s32
+# CHECK-LABEL: name: test_sdiv_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+# CHECK: id: 0, class: gpr
+# CHECK: id: 1, class: gpr
+# CHECK: id: 2, class: gpr
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0
+
+ %1(s32) = COPY %r1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
+
+ %2(s32) = G_SDIV %0, %1
+ ; CHECK: [[VREGRES:%[0-9]+]] = SDIV [[VREGX]], [[VREGY]], 14, _
+
+ %r0 = COPY %2(s32)
+ ; CHECK: %r0 = COPY [[VREGRES]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_udiv_s32
+# CHECK-LABEL: name: test_udiv_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+# CHECK: id: 0, class: gpr
+# CHECK: id: 1, class: gpr
+# CHECK: id: 2, class: gpr
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0
+
+ %1(s32) = COPY %r1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
+
+ %2(s32) = G_UDIV %0, %1
+ ; CHECK: [[VREGRES:%[0-9]+]] = UDIV [[VREGX]], [[VREGY]], 14, _
+
+ %r0 = COPY %2(s32)
+ ; CHECK: %r0 = COPY [[VREGRES]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
name: test_load_from_stack
# CHECK-LABEL: name: test_load_from_stack
legalized: true
@@ -714,8 +815,8 @@ body: |
BX_RET 14, _, implicit %r0
...
---
-name: test_constants
-# CHECK-LABEL: name: test_constants
+name: test_constant_imm
+# CHECK-LABEL: name: test_constant_imm
legalized: true
regBankSelected: true
selected: false
@@ -732,6 +833,26 @@ body: |
BX_RET 14, _, implicit %r0
...
---
+name: test_constant_cimm
+# CHECK-LABEL: name: test_constant_cimm
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+# CHECK: id: [[C:[0-9]+]], class: gpr
+body: |
+ bb.0:
+ ; Adding a type on G_CONSTANT changes its operand from an Imm into a CImm.
+ ; We still want to see the same thing in the output though.
+ %0(s32) = G_CONSTANT i32 42
+ ; CHECK: %[[C]] = MOVi 42, 14, _, _
+
+ %r0 = COPY %0(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
name: test_soft_fp_double
# CHECK-LABEL: name: test_soft_fp_double
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll
new file mode 100644
index 0000000000000..2881740b016fd
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll
@@ -0,0 +1,68 @@
+; We use V6 ops so we can easily check for the extensions (sxth vs bit tricks).
+; RUN: llc -mtriple arm-gnueabi -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV
+; RUN: llc -mtriple arm-gnueabi -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-AEABI
+; RUN: llc -mtriple arm-gnu -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV
+; RUN: llc -mtriple arm-gnu -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-DEFAULT
+
+define arm_aapcscc i32 @test_sdiv_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: test_sdiv_i32:
+; HWDIV: sdiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_idiv
+; SOFT-DEFAULT: blx __divsi3
+ %r = sdiv i32 %a, %b
+ ret i32 %r
+}
+
+define arm_aapcscc i32 @test_udiv_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: test_udiv_i32:
+; HWDIV: udiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_uidiv
+; SOFT-DEFAULT: blx __udivsi3
+ %r = udiv i32 %a, %b
+ ret i32 %r
+}
+
+define arm_aapcscc i16 @test_sdiv_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: test_sdiv_i16:
+; CHECK-DAG: sxth r0, r0
+; CHECK-DAG: sxth r1, r1
+; HWDIV: sdiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_idiv
+; SOFT-DEFAULT: blx __divsi3
+ %r = sdiv i16 %a, %b
+ ret i16 %r
+}
+
+define arm_aapcscc i16 @test_udiv_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: test_udiv_i16:
+; CHECK-DAG: uxth r0, r0
+; CHECK-DAG: uxth r1, r1
+; HWDIV: udiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_uidiv
+; SOFT-DEFAULT: blx __udivsi3
+ %r = udiv i16 %a, %b
+ ret i16 %r
+}
+
+define arm_aapcscc i8 @test_sdiv_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: test_sdiv_i8:
+; CHECK-DAG: sxtb r0, r0
+; CHECK-DAG: sxtb r1, r1
+; HWDIV: sdiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_idiv
+; SOFT-DEFAULT: blx __divsi3
+ %r = sdiv i8 %a, %b
+ ret i8 %r
+}
+
+define arm_aapcscc i8 @test_udiv_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: test_udiv_i8:
+; CHECK-DAG: uxtb r0, r0
+; CHECK-DAG: uxtb r1, r1
+; HWDIV: udiv r0, r0, r1
+; SOFT-AEABI: blx __aeabi_uidiv
+; SOFT-DEFAULT: blx __udivsi3
+ %r = udiv i8 %a, %b
+ ret i8 %r
+}
+
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll
index f3ca2915f306e..da02bfe68519d 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll
@@ -7,6 +7,14 @@ entry:
ret void
}
+define i32 @test_constant_return_i32() {
+; CHECK-LABEL: test_constant_return_i32:
+; CHECK: mov r0, #42
+; CHECK: bx lr
+entry:
+ ret i32 42
+}
+
define zeroext i1 @test_zext_i1(i1 %x) {
; CHECK-LABEL: test_zext_i1
; CHECK: and r0, r0, #1
@@ -40,6 +48,30 @@ entry:
ret i16 %x
}
+define void @test_trunc_i32_i16(i32 %v, i16 *%p) {
+; CHECK-LABEL: test_trunc_i32_i16:
+; The trunc doesn't result in any instructions, but we
+; expect the store to be explicitly 16-bit.
+; CHECK: strh r0, [r1]
+; CHECK: bx lr
+entry:
+ %v16 = trunc i32 %v to i16
+ store i16 %v16, i16 *%p
+ ret void
+}
+
+define void @test_trunc_i32_i8(i32 %v, i8 *%p) {
+; CHECK-LABEL: test_trunc_i32_i8:
+; The trunc doesn't result in any instructions, but we
+; expect the store to be explicitly 8-bit.
+; CHECK: strb r0, [r1]
+; CHECK: bx lr
+entry:
+ %v8 = trunc i32 %v to i8
+ store i8 %v8, i8 *%p
+ ret void
+}
+
define i8 @test_add_i8(i8 %x, i8 %y) {
; CHECK-LABEL: test_add_i8:
; CHECK: add r0, r0, r1
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
new file mode 100644
index 0000000000000..6f3e09d328cfe
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
@@ -0,0 +1,230 @@
+# RUN: llc -mtriple arm-linux-gnueabi -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV
+# RUN: llc -mtriple arm-linux-gnueabi -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-AEABI
+# RUN: llc -mtriple arm-linux-gnu -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV
+# RUN: llc -mtriple arm-linux-gnu -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-DEFAULT
+--- |
+ define void @test_sdiv_i32() { ret void }
+ define void @test_udiv_i32() { ret void }
+
+ define void @test_sdiv_i16() { ret void }
+ define void @test_udiv_i16() { ret void }
+
+ define void @test_sdiv_i8() { ret void }
+ define void @test_udiv_i8() { ret void }
+...
+---
+name: test_sdiv_i32
+# CHECK-LABEL: name: test_sdiv_i32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ %2(s32) = G_SDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_udiv_i32
+# CHECK-LABEL: name: test_udiv_i32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ %2(s32) = G_UDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_sdiv_i16
+# CHECK-LABEL: name: test_sdiv_i16
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1
+ ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16)
+ ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16)
+ %0(s16) = COPY %r0
+ %1(s16) = COPY %r1
+ ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X32]]
+ ; SOFT-DAG: %r1 = COPY [[Y32]]
+ ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ %2(s16) = G_SDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s16)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_udiv_i16
+# CHECK-LABEL: name: test_udiv_i16
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1
+ ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16)
+ ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16)
+ %0(s16) = COPY %r0
+ %1(s16) = COPY %r1
+ ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X32]]
+ ; SOFT-DAG: %r1 = COPY [[Y32]]
+ ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ %2(s16) = G_UDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s16)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_sdiv_i8
+# CHECK-LABEL: name: test_sdiv_i8
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1
+ ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8)
+ ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8)
+ %0(s8) = COPY %r0
+ %1(s8) = COPY %r1
+ ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X32]]
+ ; SOFT-DAG: %r1 = COPY [[Y32]]
+ ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ %2(s8) = G_SDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s8)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_udiv_i8
+# CHECK-LABEL: name: test_udiv_i8
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1
+ ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8)
+ ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8)
+ %0(s8) = COPY %r0
+ %1(s8) = COPY %r1
+ ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X32]]
+ ; SOFT-DAG: %r1 = COPY [[Y32]]
+ ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ %2(s8) = G_UDIV %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s8)
+ BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index e7935832f98a8..4e94fb4e34819 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -13,6 +13,9 @@
define void @test_mul_s16() { ret void }
define void @test_mul_s8() { ret void }
+ define void @test_sdiv_s32() #1 { ret void }
+ define void @test_udiv_s32() #1 { ret void }
+
define void @test_loads() #0 { ret void }
define void @test_stores() #0 { ret void }
@@ -22,12 +25,15 @@
define void @test_constants() { ret void }
+ define void @test_trunc_s32_16() { ret void }
+
define void @test_fadd_s32() #0 { ret void }
define void @test_fadd_s64() #0 { ret void }
define void @test_soft_fp_s64() #0 { ret void }
attributes #0 = { "target-features"="+vfp2"}
+ attributes #1 = { "target-features"="+hwdiv-arm" }
...
---
name: test_add_s32
@@ -290,6 +296,58 @@ body: |
...
---
+name: test_sdiv_s32
+# CHECK-LABEL: name: test_sdiv_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = G_SDIV %0, %1
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
+name: test_udiv_s32
+# CHECK-LABEL: name: test_udiv_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = G_UDIV %0, %1
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
name: test_loads
# CHECK-LABEL: name: test_loads
legalized: true
@@ -442,6 +500,27 @@ body: |
BX_RET 14, _, implicit %r0
...
---
+name: test_trunc_s32_16
+# CHECK-LABEL: name: test_trunc_s32_16
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ %1(s16) = G_TRUNC %0(s32)
+ %r0 = COPY %1(s16)
+ BX_RET 14, _, implicit %r0
+...
+---
name: test_fadd_s32
# CHECK-LABEL: name: test_fadd_s32
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
new file mode 100644
index 0000000000000..e3680ed2b9298
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple arm-unknown -verify-machineinstrs -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2>&1 | FileCheck %s
+
+; This file checks that we use the fallback path for things that are known to
+; be unsupported on the ARM target. It should progressively shrink in size.
+
+define <4 x i32> @test_int_vectors(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: <4 x i32> (<4 x i32>, <4 x i32>)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_int_vectors
+ %res = add <4 x i32> %a, %b
+ ret <4 x i32> %res
+}
+
+define <4 x float> @test_float_vectors(<4 x float> %a, <4 x float> %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: <4 x float> (<4 x float>, <4 x float>)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_float_vectors
+ %res = fadd <4 x float> %a, %b
+ ret <4 x float> %res
+}
+
+define i64 @test_i64(i64 %a, i64 %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: i64 (i64, i64)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_i64
+ %res = add i64 %a, %b
+ ret i64 %res
+}
+
+define i128 @test_i128(i128 %a, i128 %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: i128 (i128, i128)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_i128
+ %res = add i128 %a, %b
+ ret i128 %res
+}
+
+define i17 @test_funny_ints(i17 %a, i17 %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: i17 (i17, i17)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_funny_ints
+ %res = add i17 %a, %b
+ ret i17 %res
+}
+
+define half @test_half(half %a, half %b) {
+; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
+ %res = fadd half %a, %b
+ ret half %res
+}
+
+; On ARM, clang lowers structs to arrays.
+define void @test_arrays([2 x i32] %this.could.come.from.a.struct) {
+; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays
+ ret void
+}
+
+define void @test_structs({i32, i32} %struct) {
+; CHECK: remark: {{.*}} unable to lower arguments: void ({ i32, i32 })*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_structs
+ ret void
+}
+
+define void @test_vararg_definition(i32 %a, ...) {
+; CHECK: remark: {{.*}} unable to lower arguments: void (i32, ...)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_definition
+ ret void
+}
+
+define void @test_vararg_call(i32 %a) {
+; CHECK: remark: {{.*}} unable to translate instruction: call
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_call
+ call void(i32, ...) @test_vararg_definition(i32 %a, i32 %a, i32 %a)
+ ret void
+}
+
+define i32 @test_thumb(i32 %a) #0 {
+; CHECK: remark: {{.*}} unable to lower arguments: i32 (i32)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_thumb
+ ret i32 %a
+}
+
+attributes #0 = { "target-features"="+thumb-mode" }
diff --git a/test/CodeGen/ARM/bool-ext-inc.ll b/test/CodeGen/ARM/bool-ext-inc.ll
index fe43f1b2ef93d..b91b9b258991f 100644
--- a/test/CodeGen/ARM/bool-ext-inc.ll
+++ b/test/CodeGen/ARM/bool-ext-inc.ll
@@ -30,3 +30,42 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) {
ret <4 x i32> %add
}
+define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: cmpgt_sext_inc_vec:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r12, sp
+; CHECK-NEXT: vmov d19, r2, r3
+; CHECK-NEXT: vmov.i32 q10, #0x1
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: vmov d18, r0, r1
+; CHECK-NEXT: vcgt.s32 q8, q9, q8
+; CHECK-NEXT: vadd.i32 q8, q8, q10
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %cmp = icmp sgt <4 x i32> %x, %y
+ %ext = sext <4 x i1> %cmp to <4 x i32>
+ %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: cmpne_sext_inc_vec:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r12, sp
+; CHECK-NEXT: vmov d19, r2, r3
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: vmov d18, r0, r1
+; CHECK-NEXT: vceq.i32 q8, q9, q8
+; CHECK-NEXT: vmov.i32 q9, #0x1
+; CHECK-NEXT: vmvn q8, q8
+; CHECK-NEXT: vadd.i32 q8, q8, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %cmp = icmp ne <4 x i32> %x, %y
+ %ext = sext <4 x i1> %cmp to <4 x i32>
+ %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
+
diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll
new file mode 100644
index 0000000000000..ec032ccac423c
--- /dev/null
+++ b/test/CodeGen/ARM/fence-singlethread.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -filetype=obj -o %t
+; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ
+
+; OBJ-NOT: dmb
+
+define void @fence_singlethread() {
+; CHECK-LABEL: fence_singlethread:
+; CHECK-NOT: dmb
+; CHECK: @ COMPILER BARRIER
+; CHECK-NOT: dmb
+
+ fence singlethread seq_cst
+ ret void
+}
diff --git a/test/CodeGen/ARM/v6m-smul-with-overflow.ll b/test/CodeGen/ARM/v6m-smul-with-overflow.ll
new file mode 100644
index 0000000000000..6e8a7041de2b9
--- /dev/null
+++ b/test/CodeGen/ARM/v6m-smul-with-overflow.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s
+
+define i1 @signed_multiplication_did_overflow(i32, i32) {
+; CHECK-LABEL: signed_multiplication_did_overflow:
+entry-block:
+ %2 = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %0, i32 %1)
+ %3 = extractvalue { i32, i1 } %2, 1
+ ret i1 %3
+
+; CHECK: mov r2, r1
+; CHECK: asrs r1, r0, #31
+; CHECK: asrs r3, r2, #31
+; CHECK: bl __aeabi_lmul
+}
+
+declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index 1aa23597cf499..3409d37a31f4c 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -485,6 +485,26 @@ define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
ret <2 x i16> %x
}
+; And <2 x i8> to <2 x i32>
+define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
+; CHECK: vadd.i32
+ %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
+ %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
+ %x = add <2 x i8> %tmp2, %tmp1
+ ret <2 x i8> %x
+}
+
+define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
+; CHECK: vadd.i32
+ %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+ %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
+ %x = add <2 x i16> %tmp2, %tmp1
+ ret <2 x i16> %x
+}
+
+
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
new file mode 100644
index 0000000000000..b19e44e29fb66
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
@@ -0,0 +1,35 @@
+# RUN: llc -O0 %s -o - -march=avr | FileCheck %s
+
+# This test checks the expansion of the 16-bit 'LDDWRdPtrQ' pseudo instruction.
+#
+# This test ensures that the pseudo expander can correctly handle the case
+# where we are expanding a 16-bit LDD instruction where the source and
+# destination registers are the same.
+#
+# The instruction itself is earlyclobber and so ISel will never produce an
+# instruction like this, but the stack slot loading can and will.
+
+--- |
+ target triple = "avr--"
+ define void @test_lddwrdptrq() {
+ entry:
+ ret void
+ }
+...
+
+---
+name: test_lddwrdptrq
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: test_lddwrdptrq
+
+ ; CHECK: ldd [[SCRATCH:r[0-9]+]], Z+10
+ ; CHECK-NEXT: push [[SCRATCH]]
+ ; CHECK-NEXT: ldd [[SCRATCH]], Z+11
+ ; CHECK-NEXT: mov r31, [[SCRATCH]]
+ ; CHECK-NEXT: pop r30
+
+ early-clobber %r31r30 = LDDWRdPtrQ undef %r31r30, 10
+...
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir
new file mode 100644
index 0000000000000..3e7fdcd400d21
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir
@@ -0,0 +1,29 @@
+# RUN: llc -O0 %s -o - | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction when the source and destination registers are the same.
+
+--- |
+ target triple = "avr--"
+ define void @test_ldwrdptr() {
+ entry:
+ ret void
+ }
+...
+
+---
+name: test_ldwrdptr
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: test_ldwrdptr
+
+ ; CHECK: ld [[SCRATCH:r[0-9]+]], Z
+ ; CHECK-NEXT: push [[SCRATCH]]
+ ; CHECK-NEXT: ldd [[SCRATCH]], Z+1
+ ; CHECK-NEXT: mov r31, [[SCRATCH]]
+ ; CHECK-NEXT: pop r30
+
+ early-clobber %r31r30 = LDWRdPtr undef %r31r30
+...
+
diff --git a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
deleted file mode 100644
index 8427a2bfb4edf..0000000000000
--- a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
+++ /dev/null
@@ -1,35 +0,0 @@
-# RUN: llc -O0 %s -o - -march=avr | FileCheck %s
-
-# This test ensures that the pseudo expander can correctly handle the case
-# where we are expanding a 16-bit LDD instruction where the source and
-# destination registers are the same.
-#
-# The instruction itself is earlyclobber and so ISel will never produce an
-# instruction like this, but the stack slot loading can and will.
-
---- |
- target triple = "avr--"
-
- define void @test_lddw() {
- entry:
- ret void
- }
-
-...
----
-name: test_lddw
-tracksRegLiveness: true
-stack:
- - { id: 0, type: spill-slot, offset: -4, size: 1, alignment: 1, callee-saved-register: '%r28' }
-body: |
- bb.0.entry:
- liveins: %r28, %r29
-
- ; CHECK-LABEL: test_lddw
-
- ; CHECK: ldd [[TMPREG:r[0-9]+]], Y+0
- ; CHECK-NEXT: mov r28, [[TMPREG]]
- ; CHECK-NEXT: ldd [[TMPREG]], Y+1
- ; CHECK-NEXT: mov r29, [[TMPREG]]
- dead early-clobber %r29r28 = LDDWRdYQ killed %r29r28, 0
-...
diff --git a/test/CodeGen/MSP430/select-use-sr.ll b/test/CodeGen/MSP430/select-use-sr.ll
new file mode 100644
index 0000000000000..3f67fb85f793f
--- /dev/null
+++ b/test/CodeGen/MSP430/select-use-sr.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=msp430 | FileCheck %s
+; PR32769
+
+target triple = "msp430"
+
+; Test that the CMP instruction is not removed by MachineCSE.
+;
+; CHECK-LABEL: @f
+; CHECK: cmp.w r15, r13
+; CHECK: cmp.w r15, r13
+; CHECK-NEXT: jeq .LBB0_2
+define i16 @f(i16, i16, i16, i16) {
+entry:
+ %4 = icmp ult i16 %1, %3
+ %5 = zext i1 %4 to i16
+ %6 = icmp ult i16 %0, %2
+ %7 = zext i1 %6 to i16
+ %8 = icmp eq i16 %1, %3
+ %out = select i1 %8, i16 %5, i16 %7
+ ret i16 %out
+}
diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll
index 1562372ce9a09..20853073dfa6f 100644
--- a/test/CodeGen/Mips/llvm-ir/mul.ll
+++ b/test/CodeGen/Mips/llvm-ir/mul.ll
@@ -268,7 +268,7 @@ entry:
; MM64R6: daddu $2, $[[T1]], $[[T0]]
; MM64R6-DAG: dmul $3, $5, $7
- ; MM32: lw $25, %call16(__multi3)($16)
+ ; MM32: lw $25, %call16(__multi3)($gp)
%r = mul i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll
index defd25bb41acf..ee2b212a9f2f4 100644
--- a/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -172,7 +172,7 @@ entry:
; 64R6: ddiv $2, $4, $5
; 64R6: teq $5, $zero, 7
- ; MM32: lw $25, %call16(__divdi3)($2)
+ ; MM32: lw $25, %call16(__divdi3)($gp)
; MM64: ddiv $2, $4, $5
; MM64: teq $5, $zero, 7
@@ -184,15 +184,7 @@ entry:
define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: sdiv_i128:
-
- ; GP32: lw $25, %call16(__divti3)($gp)
-
- ; GP64-NOT-R6: ld $25, %call16(__divti3)($gp)
- ; 64R6: ld $25, %call16(__divti3)($gp)
-
- ; MM32: lw $25, %call16(__divti3)($16)
-
- ; MM64: ld $25, %call16(__divti3)($2)
+ ; ALL: l{{w|d}} $25, %call16(__divti3)($gp)
%r = sdiv i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll
index 42664d7457e5a..812c105669799 100644
--- a/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -164,7 +164,7 @@ entry:
; 64R6: dmod $2, $4, $5
; 64R6: teq $5, $zero, 7
- ; MM32: lw $25, %call16(__moddi3)($2)
+ ; MM32: lw $25, %call16(__moddi3)($gp)
; MM64: dmod $2, $4, $5
; MM64: teq $5, $zero, 7
@@ -177,14 +177,7 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: srem_i128:
- ; GP32: lw $25, %call16(__modti3)($gp)
-
- ; GP64-NOT-R6: ld $25, %call16(__modti3)($gp)
- ; 64R6: ld $25, %call16(__modti3)($gp)
-
- ; MM32: lw $25, %call16(__modti3)($16)
-
- ; MM64: ld $25, %call16(__modti3)($2)
+ ; ALL: l{{w|d}} $25, %call16(__modti3)($gp)
%r = srem i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll
index 78ab36442a9ae..6e078fdedfca3 100644
--- a/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -134,7 +134,7 @@ entry:
; 64R6: ddivu $2, $4, $5
; 64R6: teq $5, $zero, 7
- ; MM32: lw $25, %call16(__udivdi3)($2)
+ ; MM32: lw $25, %call16(__udivdi3)($gp)
; MM64: ddivu $2, $4, $5
; MM64: teq $5, $zero, 7
@@ -147,14 +147,7 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: udiv_i128:
- ; GP32: lw $25, %call16(__udivti3)($gp)
-
- ; GP64-NOT-R6: ld $25, %call16(__udivti3)($gp)
- ; 64-R6: ld $25, %call16(__udivti3)($gp)
-
- ; MM32: lw $25, %call16(__udivti3)($16)
-
- ; MM64: ld $25, %call16(__udivti3)($2)
+ ; ALL: l{{w|d}} $25, %call16(__udivti3)($gp)
%r = udiv i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll
index 160c126c7e3ab..3bc82ceecd2a6 100644
--- a/test/CodeGen/Mips/llvm-ir/urem.ll
+++ b/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -190,7 +190,7 @@ entry:
; 64R6: dmodu $2, $4, $5
; 64R6: teq $5, $zero, 7
- ; MM32: lw $25, %call16(__umoddi3)($2)
+ ; MM32: lw $25, %call16(__umoddi3)($gp)
; MM64: dmodu $2, $4, $5
; MM64: teq $5, $zero, 7
@@ -208,9 +208,9 @@ entry:
; GP64-NOT-R6: ld $25, %call16(__umodti3)($gp)
; 64R6: ld $25, %call16(__umodti3)($gp)
- ; MM32: lw $25, %call16(__umodti3)($16)
+ ; MM32: lw $25, %call16(__umodti3)($gp)
- ; MM64: ld $25, %call16(__umodti3)($2)
+ ; MM64: ld $25, %call16(__umodti3)($gp)
%r = urem i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/micromips-gp-rc.ll b/test/CodeGen/Mips/micromips-gp-rc.ll
index f139f7a8486da..16e55c357db68 100644
--- a/test/CodeGen/Mips/micromips-gp-rc.ll
+++ b/test/CodeGen/Mips/micromips-gp-rc.ll
@@ -14,5 +14,5 @@ entry:
; Function Attrs: noreturn
declare void @exit(i32 signext)
-; CHECK: move $gp, ${{[0-9]+}}
+; CHECK: addu $gp, ${{[0-9]+}}
diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll
index 564ffdd2f691b..6fa506849ee6b 100644
--- a/test/CodeGen/Mips/mips64fpldst.ll
+++ b/test/CodeGen/Mips/mips64fpldst.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64
@f0 = common global float 0.000000e+00, align 4
@d0 = common global double 0.000000e+00, align 8
diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll b/test/CodeGen/Mips/tailcall/tailcall.ll
index 3f04e1cf30531..01a9b64ba63c6 100644
--- a/test/CodeGen/Mips/tailcall/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall/tailcall.ll
@@ -176,7 +176,7 @@ entry:
; ALL-LABEL: caller8_1:
; PIC32: jalr $25
; PIC32R6: jalr $25
-; PIC32MM: jalr $25
+; PIC32MM: jalr{{.*}} $25
; STATIC32: jal
; PIC64: jalr $25
; STATIC64: jal
@@ -288,7 +288,7 @@ entry:
; ALL-LABEL: caller13:
; PIC32: jalr $25
; PIC32R6: jalr $25
-; PIC32MM: jalr $25
+; PIC32MM: jalr{{.*}} $25
; STATIC32: jal
; STATIC64: jal
; PIC64R6: jalr $25
diff --git a/test/CodeGen/PowerPC/empty-functions.ll b/test/CodeGen/PowerPC/empty-functions.ll
index 56db8f39bffdd..b8394e14318fb 100644
--- a/test/CodeGen/PowerPC/empty-functions.ll
+++ b/test/CodeGen/PowerPC/empty-functions.ll
@@ -24,9 +24,7 @@ entry:
; LINUX-NO-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]]
; LINUX-NO-FP-NEXT: .cfi_endproc
-; A cfi directive can point to the end of a function. It (and in fact the
-; entire body) could be optimized out because of the unreachable, but we
-; don't do it right now.
+; A cfi directive cannot point to the end of a function.
; LINUX-FP: func:
; LINUX-FP-NEXT: {{^}}.L[[BEGIN:.*]]:{{$}}
; LINUX-FP-NEXT: .cfi_startproc
@@ -38,8 +36,6 @@ entry:
; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
; LINUX-FP-NEXT: .cfi_offset r31, -4
; LINUX-FP-NEXT: mr 31, 1
-; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
-; LINUX-FP-NEXT: .cfi_def_cfa_register r31
; LINUX-FP-NEXT: {{^}}.L[[END:.*]]:{{$}}
; LINUX-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]]
; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/SPARC/empty-functions.ll b/test/CodeGen/SPARC/empty-functions.ll
index 1f8c5e3a312d0..974df232033a5 100644
--- a/test/CodeGen/SPARC/empty-functions.ll
+++ b/test/CodeGen/SPARC/empty-functions.ll
@@ -14,19 +14,11 @@ entry:
; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func
; LINUX-NO-FP-NEXT: .cfi_endproc
-; A cfi directive can point to the end of a function. It (and in fact the
-; entire body) could be optimized out because of the unreachable, but we
-; don't do it right now.
+; A cfi directive cannot point to the end of a function.
; LINUX-FP: func:
; LINUX-FP-NEXT: .cfi_startproc
; LINUX-FP-NEXT: {{^}}!
; LINUX-FP-NEXT: save %sp, -96, %sp
; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
-; LINUX-FP-NEXT: .cfi_def_cfa_register %fp
-; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
-; LINUX-FP-NEXT: .cfi_window_save
-; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
-; LINUX-FP-NEXT: .cfi_register 15, 31
-; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
; LINUX-FP-NEXT: .size func, .Lfunc_end0-func
; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll
new file mode 100644
index 0000000000000..fc3b7ef1dadeb
--- /dev/null
+++ b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll
@@ -0,0 +1,229 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs -disable-lsr | FileCheck %s
+;
+; Regression test for a machine verifier complaint discovered with llvm-stress.
+; Test that splitting of a 128-bit store does not result in the use of an undefined physical register.
+; This test case involved spilling of 128 bits, where the data operand was killed.
+
+define void @autogen_SD15107(i8*, i32*, i64*, i32, i64, i8) {
+; CHECK: .text
+BB:
+ %A4 = alloca double
+ %A1 = alloca i32
+ %L = load i8, i8* %0
+ br label %CF331
+
+CF331: ; preds = %CF331, %BB
+ %Shuff = shufflevector <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, <8 x i32> <i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+ %L5 = load i8, i8* %0
+ %FC9 = fptosi float 0xC59D259100000000 to i8
+ %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 10, i32 undef, i32 14, i32 0, i32 undef, i32 4, i32 6, i32 8>
+ %Tr = trunc <8 x i16> zeroinitializer to <8 x i1>
+ %Sl16 = select i1 true, i64 448097, i64 253977
+ %E18 = extractelement <2 x i1> zeroinitializer, i32 1
+ br i1 %E18, label %CF331, label %CF350
+
+CF350: ; preds = %CF331
+ %Cmp22 = icmp slt i8 %L, -1
+ br label %CF
+
+CF: ; preds = %CF333, %CF364, %CF, %CF350
+ %Shuff25 = shufflevector <16 x i1> zeroinitializer, <16 x i1> zeroinitializer, <16 x i32> <i32 25, i32 27, i32 29, i32 31, i32 1, i32 undef, i32 undef, i32 7, i32 9, i32 11, i32 undef, i32 15, i32 17, i32 19, i32 21, i32 23>
+ %B27 = mul <8 x i8> zeroinitializer, %Shuff
+ %L31 = load i8, i8* %0
+ store i8 %L5, i8* %0
+ %E32 = extractelement <8 x i64> %Shuff13, i32 5
+ %Sl37 = select i1 %E18, i64* %2, i64* %2
+ %E40 = extractelement <8 x i64> %Shuff13, i32 4
+ %I42 = insertelement <8 x i64> %Shuff13, i64 0, i32 1
+ %Sl44 = select i1 true, double* %A4, double* %A4
+ %L46 = load i64, i64* %Sl37
+ br i1 undef, label %CF, label %CF335
+
+CF335: ; preds = %CF335, %CF
+ %Shuff48 = shufflevector <8 x i16> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> <i32 undef, i32 15, i32 undef, i32 3, i32 5, i32 7, i32 9, i32 11>
+ %B50 = sub <8 x i64> undef, zeroinitializer
+ %Se = sext i1 %Cmp22 to i64
+ %Cmp52 = icmp ule i64 %E40, 184653
+ br i1 %Cmp52, label %CF335, label %CF364
+
+CF364: ; preds = %CF335
+ store i64 %E32, i64* %Sl37
+ %B57 = udiv <8 x i64> %I42, %B50
+ %L61 = load i64, i64* %Sl37
+ %Sl65 = select i1 undef, i1 %Cmp52, i1 true
+ br i1 %Sl65, label %CF, label %CF333
+
+CF333: ; preds = %CF364
+ %Cmp66 = fcmp uge float 0x474A237E00000000, undef
+ br i1 %Cmp66, label %CF, label %CF324
+
+CF324: ; preds = %CF358, %CF360, %CF333
+ %L67 = load i64, i64* %Sl37
+ %Sl73 = select i1 %E18, i8 %L, i8 %L31
+ %ZE = zext i1 true to i32
+ %Cmp81 = icmp ult i64 184653, %L46
+ br label %CF346
+
+CF346: ; preds = %CF363, %CF346, %CF324
+ %L82 = load double, double* %Sl44
+ store i64 %Se, i64* %Sl37
+ br i1 undef, label %CF346, label %CF363
+
+CF363: ; preds = %CF346
+ %I85 = insertelement <8 x i64> undef, i64 0, i32 4
+ %Se86 = sext i1 %Cmp81 to i64
+ %Cmp88 = icmp eq <16 x i1> zeroinitializer, undef
+ %Shuff91 = shufflevector <8 x i64> %B57, <8 x i64> %I42, <8 x i32> <i32 1, i32 undef, i32 5, i32 7, i32 undef, i32 11, i32 13, i32 undef>
+ %Sl95 = select i1 undef, i8 -1, i8 %5
+ store i8 %FC9, i8* %0
+ %Sl102 = select i1 %Sl65, float 0x3AAFABC380000000, float undef
+ %L104 = load i64, i64* %Sl37
+ store i8 %Sl95, i8* %0
+ br i1 undef, label %CF346, label %CF360
+
+CF360: ; preds = %CF363
+ %I107 = insertelement <16 x i1> undef, i1 %Sl65, i32 3
+ %B108 = fdiv float undef, %Sl102
+ %FC109 = sitofp <16 x i1> %Shuff25 to <16 x float>
+ %Cmp111 = icmp slt i8 %Sl73, %Sl95
+ br i1 %Cmp111, label %CF324, label %CF344
+
+CF344: ; preds = %CF344, %CF360
+ store i64 %4, i64* %Sl37
+ br i1 undef, label %CF344, label %CF358
+
+CF358: ; preds = %CF344
+ %B116 = add i8 29, %5
+ %Sl118 = select i1 %Cmp81, <8 x i1> undef, <8 x i1> %Tr
+ %L120 = load i16, i16* undef
+ store i8 %FC9, i8* %0
+ %E121 = extractelement <16 x i1> %Shuff25, i32 3
+ br i1 %E121, label %CF324, label %CF325
+
+CF325: ; preds = %CF362, %CF358
+ %I123 = insertelement <8 x i16> undef, i16 %L120, i32 0
+ %Sl125 = select i1 undef, i32 undef, i32 199785
+ %Cmp126 = icmp ule <16 x i1> undef, %Cmp88
+ br label %CF356
+
+CF356: ; preds = %CF356, %CF325
+ %FC131 = sitofp <8 x i8> %B27 to <8 x double>
+ store i8 %Sl73, i8* %0
+ store i64 396197, i64* %Sl37
+ %L150 = load i64, i64* %Sl37
+ %Cmp157 = icmp ult i64 %L150, %L61
+ br i1 %Cmp157, label %CF356, label %CF359
+
+CF359: ; preds = %CF359, %CF356
+ %B162 = srem <8 x i64> %I85, %Shuff13
+ %Tr163 = trunc i64 %Se to i8
+ %Sl164 = select i1 %Cmp52, i32* %A1, i32* %1
+ store i64 %E32, i64* undef
+ %I168 = insertelement <8 x i16> %I123, i16 undef, i32 5
+ %Se170 = sext i1 %Cmp81 to i32
+ %Cmp172 = icmp uge i8 %Sl73, %Sl73
+ br i1 %Cmp172, label %CF359, label %CF362
+
+CF362: ; preds = %CF359
+ store i16 0, i16* undef
+ store i64 448097, i64* %Sl37
+ %E189 = extractelement <8 x i16> %Shuff48, i32 6
+ %Sl194 = select i1 %Cmp111, i8 29, i8 0
+ %Cmp195 = icmp eq i32 %ZE, %ZE
+ br i1 %Cmp195, label %CF325, label %CF326
+
+CF326: ; preds = %CF342, %CF362
+ store i64 %L104, i64* undef
+ br label %CF342
+
+CF342: ; preds = %CF326
+ %Cmp203 = icmp ule i1 %Cmp195, %E18
+ br i1 %Cmp203, label %CF326, label %CF337
+
+CF337: ; preds = %CF342
+ br label %CF327
+
+CF327: ; preds = %CF336, %CF355, %CF327, %CF337
+ store i64 %Se86, i64* undef
+ %Tr216 = trunc i64 184653 to i16
+ %Sl217 = select i1 %Cmp157, <4 x i1> undef, <4 x i1> undef
+ %Cmp218 = icmp slt i32 undef, %Se170
+ br i1 %Cmp218, label %CF327, label %CF355
+
+CF355: ; preds = %CF327
+ %E220 = extractelement <16 x i1> %Cmp126, i32 3
+ br i1 %E220, label %CF327, label %CF340
+
+CF340: ; preds = %CF355
+ %Sl224 = select i1 %Sl65, double undef, double 0xBE278346AB25A5C4
+ br label %CF334
+
+CF334: ; preds = %CF343, %CF334, %CF340
+ %L226 = load i64, i64* undef
+ store i32 %3, i32* %Sl164
+ %Cmp233 = icmp uge i16 %Tr216, %L120
+ br i1 %Cmp233, label %CF334, label %CF354
+
+CF354: ; preds = %CF334
+ store i64 %L226, i64* %Sl37
+ %Cmp240 = icmp uge i1 %Cmp52, undef
+ %Shuff243 = shufflevector <16 x i1> %I107, <16 x i1> undef, <16 x i32> <i32 28, i32 30, i32 undef, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 undef>
+ %B245 = fmul <16 x float> %FC109, %FC109
+ br label %CF343
+
+CF343: ; preds = %CF354
+ %Cmp248 = icmp sgt i8 0, %B116
+ br i1 %Cmp248, label %CF334, label %CF336
+
+CF336: ; preds = %CF343
+ store i64 %E32, i64* undef
+ br i1 undef, label %CF327, label %CF328
+
+CF328: ; preds = %CF345, %CF336
+ br label %CF345
+
+CF345: ; preds = %CF328
+ %E257 = extractelement <4 x i1> %Sl217, i32 2
+ br i1 %E257, label %CF328, label %CF338
+
+CF338: ; preds = %CF345
+ %Sl261 = select i1 %E121, <8 x i16> zeroinitializer, <8 x i16> undef
+ %Cmp262 = icmp sgt i8 undef, %Sl194
+ br label %CF329
+
+CF329: ; preds = %CF339, %CF348, %CF357, %CF338
+ store i64 %L67, i64* %Sl37
+ br label %CF357
+
+CF357: ; preds = %CF329
+ %Cmp275 = icmp ne i1 %Cmp203, %Sl65
+ br i1 %Cmp275, label %CF329, label %CF348
+
+CF348: ; preds = %CF357
+ %Shuff286 = shufflevector <8 x i16> undef, <8 x i16> %Sl261, <8 x i32> <i32 6, i32 8, i32 10, i32 12, i32 undef, i32 0, i32 2, i32 4>
+ %Cmp291 = icmp ne i32 %Sl125, undef
+ br i1 %Cmp291, label %CF329, label %CF339
+
+CF339: ; preds = %CF348
+ %Cmp299 = fcmp ugt double %L82, undef
+ br i1 %Cmp299, label %CF329, label %CF330
+
+CF330: ; preds = %CF361, %CF330, %CF339
+ %E301 = extractelement <8 x double> %FC131, i32 3
+ store i64 %Sl16, i64* %Sl37
+ %Se313 = sext <8 x i1> %Sl118 to <8 x i32>
+ %Cmp315 = icmp sgt i8 %Tr163, %L
+ br i1 %Cmp315, label %CF330, label %CF361
+
+CF361: ; preds = %CF330
+ store i16 %L120, i16* undef
+ %Shuff318 = shufflevector <8 x i64> %B162, <8 x i64> undef, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
+ %ZE321 = zext i16 %E189 to i64
+ %Sl322 = select i1 %Cmp240, i1 %Cmp262, i1 %Cmp291
+ br i1 %Sl322, label %CF330, label %CF351
+
+CF351: ; preds = %CF361
+ store double %Sl224, double* %Sl44
+ store i32 %ZE, i32* %Sl164
+ ret void
+}
diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll
index c549bd425aafe..13951ef4354b4 100644
--- a/test/CodeGen/Thumb/long.ll
+++ b/test/CodeGen/Thumb/long.ll
@@ -206,3 +206,34 @@ entry:
; CHECK: adds r0, r0, r2
; CHECK: sbcs r1, r3
}
+
+declare void @f13(i64 %x)
+
+define void @f14(i1 %x, i64 %y) #0 {
+; CHECK-LABEL: f14:
+entry:
+ %a = add i64 %y, 47
+ call void @f13(i64 %a)
+; CHECK: bl
+ br i1 %x, label %if.end, label %if.then
+
+if.then:
+ call void @f13(i64 %y)
+; CHECK: bl
+ br label %if.end
+
+if.end:
+ %b = add i64 %y, 45
+ call void @f13(i64 %b)
+; CHECK: adds
+; CHECK: adcs
+; CHECK: bl
+ %c = add i64 %y, 47
+ call void @f13(i64 %c)
+; CHECK: adds
+; CHECK-NEXT: adcs
+; CHECK: bl
+ ret void
+}
+
+attributes #0 = { optsize }
diff --git a/test/CodeGen/Thumb/optionaldef-scheduling.ll b/test/CodeGen/Thumb/optionaldef-scheduling.ll
new file mode 100644
index 0000000000000..bd091cf2b6f84
--- /dev/null
+++ b/test/CodeGen/Thumb/optionaldef-scheduling.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv6-eabi %s -verify-machineinstrs -o - | FileCheck %s
+
+define i1 @test(i64 %arg) {
+entry:
+ %ispos = icmp sgt i64 %arg, -1
+ %neg = sub i64 0, %arg
+ %sel = select i1 %ispos, i64 %arg, i64 %neg
+ %cmp2 = icmp eq i64 %sel, %arg
+ ret i1 %cmp2
+}
+
+; The scheduler used to ignore OptionalDefs, and could unwittingly insert
+; a flag-setting instruction in between an ADDS and the corresponding ADC.
+
+; CHECK: adds
+; CHECK-NOT: eors
+; CHECK: adcs
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
new file mode 100644
index 0000000000000..ec62ece6d408b
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL
+; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL
+
+define i32 @test_ret_i32() {
+; X32-LABEL: test_ret_i32:
+; X32: # BB#0:
+; X32-NEXT: movl $20, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_ret_i32:
+; X64: # BB#0:
+; X64-NEXT: movl $20, %eax
+; X64-NEXT: retq
+ ret i32 20
+}
+
+define i64 @test_ret_i64() {
+; X32_GISEL-LABEL: test_ret_i64:
+; X32_GISEL: # BB#0:
+; X32_GISEL-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X32_GISEL-NEXT: movl $15, %edx
+; X32_GISEL-NEXT: retl
+;
+; X32_ISEL-LABEL: test_ret_i64:
+; X32_ISEL: # BB#0:
+; X32_ISEL-NEXT: movl $-1, %eax
+; X32_ISEL-NEXT: movl $15, %edx
+; X32_ISEL-NEXT: retl
+;
+; X64-LABEL: test_ret_i64:
+; X64: # BB#0:
+; X64-NEXT: movabsq $68719476735, %rax # imm = 0xFFFFFFFFF
+; X64-NEXT: retq
+ ret i64 68719476735
+}
+
+define i32 @test_arg_i32(i32 %a) {
+; X32_GISEL-LABEL: test_arg_i32:
+; X32_GISEL: # BB#0:
+; X32_GISEL-NEXT: leal 4(%esp), %eax
+; X32_GISEL-NEXT: movl (%eax), %eax
+; X32_GISEL-NEXT: retl
+;
+; X32_ISEL-LABEL: test_arg_i32:
+; X32_ISEL: # BB#0:
+; X32_ISEL-NEXT: movl 4(%esp), %eax
+; X32_ISEL-NEXT: retl
+;
+; X64-LABEL: test_arg_i32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ ret i32 %a
+}
+
+define i64 @test_arg_i64(i64 %a) {
+; X32_GISEL-LABEL: test_arg_i64:
+; X32_GISEL: # BB#0:
+; X32_GISEL-NEXT: leal 4(%esp), %eax
+; X32_GISEL-NEXT: movl (%eax), %eax
+; X32_GISEL-NEXT: leal 8(%esp), %ecx
+; X32_GISEL-NEXT: movl (%ecx), %edx
+; X32_GISEL-NEXT: retl
+;
+; X32_ISEL-LABEL: test_arg_i64:
+; X32_ISEL: # BB#0:
+; X32_ISEL-NEXT: movl 4(%esp), %eax
+; X32_ISEL-NEXT: movl 8(%esp), %edx
+; X32_ISEL-NEXT: retl
+;
+; X64-LABEL: test_arg_i64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ ret i64 %a
+}
+
+define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) {
+; X32_GISEL-LABEL: test_i64_args_8:
+; X32_GISEL: # BB#0:
+; X32_GISEL-NEXT: leal 60(%esp), %eax
+; X32_GISEL-NEXT: movl (%eax), %eax
+; X32_GISEL-NEXT: leal 64(%esp), %ecx
+; X32_GISEL-NEXT: movl (%ecx), %edx
+; X32_GISEL-NEXT: retl
+;
+; X32_ISEL-LABEL: test_i64_args_8:
+; X32_ISEL: # BB#0:
+; X32_ISEL-NEXT: movl 60(%esp), %eax
+; X32_ISEL-NEXT: movl 64(%esp), %edx
+; X32_ISEL-NEXT: retl
+;
+; X64_GISEL-LABEL: test_i64_args_8:
+; X64_GISEL: # BB#0:
+; X64_GISEL-NEXT: leaq 16(%rsp), %rax
+; X64_GISEL-NEXT: movq (%rax), %rax
+; X64_GISEL-NEXT: retq
+;
+; X64_ISEL-LABEL: test_i64_args_8:
+; X64_ISEL: # BB#0:
+; X64_ISEL-NEXT: movq 16(%rsp), %rax
+; X64_ISEL-NEXT: retq
+
+ ret i64 %arg8
+}
+
+define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) {
+; X32-LABEL: test_v4i32_args:
+; X32: # BB#0:
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_v4i32_args:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ ret <4 x i32> %arg2
+}
+
+define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
+; X32-LABEL: test_v8i32_args:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_v8i32_args:
+; X64: # BB#0:
+; X64-NEXT: retq
+
+ ret <8 x i32> %arg1
+}
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
index 616cb70652bb1..8ea3e4f9d739a 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
@@ -207,24 +207,15 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4,
; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]]
; X32-NEXT: [[ARG8H:%[0-9]+]](s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0)
-; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
-; X32-NEXT: [[ARG1_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG1L]](s32), 0
-; X32-NEXT: [[ARG1_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](s32), 32
-; X32-NEXT: [[ARG1:%[0-9]+]](s64) = COPY [[ARG1_TMP1]]
- ; ... a bunch more that we don't track ...
- ; X32: IMPLICIT_DEF
- ; X32: IMPLICIT_DEF
- ; X32: IMPLICIT_DEF
- ; X32: IMPLICIT_DEF
- ; X32: IMPLICIT_DEF
-; X32: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
-; X32-NEXT: [[ARG7_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG7L]](s32), 0
-; X32-NEXT: [[ARG7_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG7_TMP0]], [[ARG7H]](s32), 32
-; X32-NEXT: [[ARG7:%[0-9]+]](s64) = COPY [[ARG7_TMP1]]
-; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
-; X32-NEXT: [[ARG8_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG8L]](s32), 0
-; X32-NEXT: [[ARG8_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG8_TMP0]], [[ARG8H]](s32), 32
-; X32-NEXT: [[ARG8:%[0-9]+]](s64) = COPY [[ARG8_TMP1]]
+; X32-NEXT: [[ARG1:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG1L]](s32), [[ARG1H]](s32)
+; ... a bunch more that we don't track ...
+; X32-NEXT: G_MERGE_VALUES
+; X32-NEXT: G_MERGE_VALUES
+; X32-NEXT: G_MERGE_VALUES
+; X32-NEXT: G_MERGE_VALUES
+; X32-NEXT: G_MERGE_VALUES
+; X32-NEXT: [[ARG7:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG7L]](s32), [[ARG7H]](s32)
+; X32-NEXT: [[ARG8:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG8L]](s32), [[ARG8H]](s32)
; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit
; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit
@@ -236,8 +227,7 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4,
; X64-NEXT: %rax = COPY [[ARG1]](s64)
; X64-NEXT: RET 0, implicit %rax
-; X32-NEXT: [[RETL:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 0
-; X32-NEXT: [[RETH:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 32
+; X32-NEXT: [[RETL:%[0-9]+]](s32), [[RETH:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG1]](s64)
; X32-NEXT: %eax = COPY [[RETL:%[0-9]+]](s32)
; X32-NEXT: %edx = COPY [[RETH:%[0-9]+]](s32)
; X32-NEXT: RET 0, implicit %eax, implicit %edx
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
index e2d938550aea0..90a05f5fc225c 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
@@ -15,12 +15,8 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
; X64: liveins: %xmm0, %xmm1
; X64: [[ARG1L:%[0-9]+]](<4 x s32>) = COPY %xmm0
; X64-NEXT: [[ARG1H:%[0-9]+]](<4 x s32>) = COPY %xmm1
-; X64-NEXT: [[UNDEF:%[0-9]+]](<8 x s32>) = IMPLICIT_DEF
-; X64-NEXT: [[ARG1_TMP0:%[0-9]+]](<8 x s32>) = G_INSERT [[UNDEF]], [[ARG1L]](<4 x s32>), 0
-; X64-NEXT: [[ARG1_TMP1:%[0-9]+]](<8 x s32>) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](<4 x s32>), 128
-; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = COPY [[ARG1_TMP1]]
-; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 0
-; X64-NEXT: [[RETH:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 128
+; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = G_MERGE_VALUES [[ARG1L]](<4 x s32>), [[ARG1H]](<4 x s32>)
+; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>), [[RETH:%[0-9]+]](<4 x s32>) = G_UNMERGE_VALUES [[ARG1]](<8 x s32>)
; X64-NEXT: %xmm0 = COPY [[RETL:%[0-9]+]](<4 x s32>)
; X64-NEXT: %xmm1 = COPY [[RETH:%[0-9]+]](<4 x s32>)
; X64-NEXT: RET 0, implicit %xmm0, implicit %xmm1
diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll
index 6fe66436e4a8a..f793e36026b1a 100644
--- a/test/CodeGen/X86/GlobalISel/memop.ll
+++ b/test/CodeGen/X86/GlobalISel/memop.ll
@@ -65,7 +65,7 @@ define double @test_load_double(double * %p1) {
; SSE-LABEL: test_load_double:
; SSE: # BB#0:
; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_load_double:
@@ -160,7 +160,7 @@ define double * @test_store_double(double %val, double * %p1) {
;
; SSE_FAST-LABEL: test_store_double:
; SSE_FAST: # BB#0:
-; SSE_FAST-NEXT: movd %xmm0, %rax
+; SSE_FAST-NEXT: movq %xmm0, %rax
; SSE_FAST-NEXT: movq %rax, (%rdi)
; SSE_FAST-NEXT: movq %rdi, %rax
; SSE_FAST-NEXT: retq
diff --git a/test/CodeGen/X86/asm-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll
index 47accdbc07b33..ced074015acef 100644
--- a/test/CodeGen/X86/asm-reg-type-mismatch.ll
+++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll
@@ -27,5 +27,5 @@ entry:
ret i64 %0
; CHECK: test2
; CHECK: movq {{.*}}, %xmm7
- ; CHECK: movd %xmm7, %rax
+ ; CHECK: movq %xmm7, %rax
}
diff --git a/test/CodeGen/X86/atomic-non-integer.ll b/test/CodeGen/X86/atomic-non-integer.ll
index 17b73ecf4e1c7..1f25c71a9f762 100644
--- a/test/CodeGen/X86/atomic-non-integer.ll
+++ b/test/CodeGen/X86/atomic-non-integer.ll
@@ -26,7 +26,7 @@ define void @store_float(float* %fptr, float %v) {
define void @store_double(double* %fptr, double %v) {
; CHECK-LABEL: @store_double
-; CHECK: movd %xmm0, %rax
+; CHECK: movq %xmm0, %rax
; CHECK: movq %rax, (%rdi)
store atomic double %v, double* %fptr unordered, align 8
ret void
@@ -59,7 +59,7 @@ define float @load_float(float* %fptr) {
define double @load_double(double* %fptr) {
; CHECK-LABEL: @load_double
; CHECK: movq (%rdi), %rax
-; CHECK: movd %rax, %xmm0
+; CHECK: movq %rax, %xmm0
%v = load atomic double, double* %fptr unordered, align 8
ret double %v
}
@@ -85,7 +85,7 @@ define void @store_float_seq_cst(float* %fptr, float %v) {
define void @store_double_seq_cst(double* %fptr, double %v) {
; CHECK-LABEL: @store_double_seq_cst
-; CHECK: movd %xmm0, %rax
+; CHECK: movq %xmm0, %rax
; CHECK: xchgq %rax, (%rdi)
store atomic double %v, double* %fptr seq_cst, align 8
ret void
@@ -102,7 +102,7 @@ define float @load_float_seq_cst(float* %fptr) {
define double @load_double_seq_cst(double* %fptr) {
; CHECK-LABEL: @load_double_seq_cst
; CHECK: movq (%rdi), %rax
-; CHECK: movd %rax, %xmm0
+; CHECK: movq %rax, %xmm0
%v = load atomic double, double* %fptr seq_cst, align 8
ret double %v
}
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
new file mode 100644
index 0000000000000..052cacfea4dc0
--- /dev/null
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -0,0 +1,2840 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_addpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fadd <4 x double> %a0, %a1
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fadd <4 x double> %1, %2
+ ret <4 x double> %3
+}
+
+define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_addps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fadd <8 x float> %a0, %a1
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = fadd <8 x float> %1, %2
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_addsubpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addsubpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addsubpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addsubpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_addsubps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addsubps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addsubps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addsubps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_andnotpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_andnotpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_andnotpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andnotpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %4 = and <4 x i64> %3, %2
+ %5 = load <4 x double>, <4 x double> *%a2, align 32
+ %6 = bitcast <4 x double> %5 to <4 x i64>
+ %7 = xor <4 x i64> %4, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %8 = and <4 x i64> %6, %7
+ %9 = bitcast <4 x i64> %8 to <4 x double>
+ %10 = fadd <4 x double> %a1, %9
+ ret <4 x double> %10
+}
+
+define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_andnotps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_andnotps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_andnotps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andnotps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <8 x float> %a0 to <4 x i64>
+ %2 = bitcast <8 x float> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %4 = and <4 x i64> %3, %2
+ %5 = load <8 x float>, <8 x float> *%a2, align 32
+ %6 = bitcast <8 x float> %5 to <4 x i64>
+ %7 = xor <4 x i64> %4, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %8 = and <4 x i64> %6, %7
+ %9 = bitcast <4 x i64> %8 to <8 x float>
+ %10 = fadd <8 x float> %a1, %9
+ ret <8 x float> %10
+}
+
+define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_andpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_andpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_andpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = and <4 x i64> %1, %2
+ %4 = load <4 x double>, <4 x double> *%a2, align 32
+ %5 = bitcast <4 x double> %4 to <4 x i64>
+ %6 = and <4 x i64> %3, %5
+ %7 = bitcast <4 x i64> %6 to <4 x double>
+ %8 = fadd <4 x double> %a1, %7
+ ret <4 x double> %8
+}
+
+define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_andps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_andps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_andps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <8 x float> %a0 to <4 x i64>
+ %2 = bitcast <8 x float> %a1 to <4 x i64>
+ %3 = and <4 x i64> %1, %2
+ %4 = load <8 x float>, <8 x float> *%a2, align 32
+ %5 = bitcast <8 x float> %4 to <4 x i64>
+ %6 = and <4 x i64> %3, %5
+ %7 = bitcast <4 x i64> %6 to <8 x float>
+ %8 = fadd <8 x float> %a1, %7
+ ret <8 x float> %8
+}
+
+define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_blendpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_blendpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fadd <4 x double> %a1, %1
+ %4 = shufflevector <4 x double> %3, <4 x double> %2, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_blendps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
+; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
+; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_blendps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
+; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 14, i32 7>
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; SANDY-LABEL: test_blendvpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendvpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendvpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_blendvpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ %2 = load <4 x double>, <4 x double> *%a3, align 32
+ %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; SANDY-LABEL: test_blendvps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendvps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendvps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_blendvps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ %2 = load <8 x float>, <8 x float> *%a3, align 32
+ %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
+; SANDY-LABEL: test_broadcastf128:
+; SANDY: # BB#0:
+; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_broadcastf128:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_broadcastf128:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_broadcastf128:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load <4 x float>, <4 x float> *%a0, align 32
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %2
+}
+
+define <4 x double> @test_broadcastsd_ymm(double *%a0) {
+; SANDY-LABEL: test_broadcastsd_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_broadcastsd_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_broadcastsd_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_broadcastsd_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load double, double *%a0, align 8
+ %2 = insertelement <4 x double> undef, double %1, i32 0
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %3
+}
+
+define <4 x float> @test_broadcastss(float *%a0) {
+; SANDY-LABEL: test_broadcastss:
+; SANDY: # BB#0:
+; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_broadcastss:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_broadcastss:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_broadcastss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load float, float *%a0, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %3
+}
+
+define <8 x float> @test_broadcastss_ymm(float *%a0) {
+; SANDY-LABEL: test_broadcastss_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_broadcastss_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_broadcastss_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_broadcastss_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load float, float *%a0, align 4
+ %2 = insertelement <8 x float> undef, float %1, i32 0
+ %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_cmppd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cmppd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cmppd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BTVER2-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmppd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fcmp oeq <4 x double> %a0, %a1
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fcmp oeq <4 x double> %a0, %2
+ %4 = sext <4 x i1> %1 to <4 x i64>
+ %5 = sext <4 x i1> %3 to <4 x i64>
+ %6 = or <4 x i64> %4, %5
+ %7 = bitcast <4 x i64> %6 to <4 x double>
+ ret <4 x double> %7
+}
+
+define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_cmpps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cmpps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cmpps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BTVER2-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fcmp oeq <8 x float> %a0, %a1
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = fcmp oeq <8 x float> %a0, %2
+ %4 = sext <8 x i1> %1 to <8 x i32>
+ %5 = sext <8 x i1> %3 to <8 x i32>
+ %6 = or <8 x i32> %4, %5
+ %7 = bitcast <8 x i32> %6 to <8 x float>
+ ret <8 x float> %7
+}
+
+define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
+; SANDY-LABEL: test_cvtdq2pd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cvtdq2pd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cvtdq2pd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2pd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = sitofp <4 x i32> %a0 to <4 x double>
+ %2 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %3 = sitofp <4 x i32> %2 to <4 x double>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
+; SANDY-LABEL: test_cvtdq2ps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cvtdq2ps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cvtdq2ps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = sitofp <8 x i32> %a0 to <8 x float>
+ %2 = load <8 x i32>, <8 x i32> *%a1, align 16
+ %3 = sitofp <8 x i32> %2 to <8 x float>
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+
+define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_cvtpd2dq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cvtpd2dq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cvtpd2dq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fptosi <4 x double> %a0 to <4 x i32>
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = fptosi <4 x double> %2 to <4 x i32>
+ %4 = shufflevector <4 x i32> %1, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %4
+}
+
+; Narrows <4 x double> to <4 x float> with fptrunc, once from the register
+; argument and once from a 32-byte-aligned load, then concatenates the two
+; halves into an <8 x float> result. The expectations pin vcvtpd2ps for the
+; register source, vcvtpd2psy for the memory source and a vinsertf128 join.
+define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_cvtpd2ps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cvtpd2ps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cvtpd2ps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fptrunc <4 x double> %a0 to <4 x float>
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = fptrunc <4 x double> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %1, <4 x float> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %4
+}
+
+; Converts <8 x float> to <8 x i32> with fptosi, once from the register
+; argument and once from a 32-byte-aligned load, and ORs the two results.
+; The integer `or` is expected to be emitted as vorps for every prefix.
+; NOTE(review): despite the function name, every prefix below expects the
+; truncating vcvttps2dq form rather than vcvtps2dq.
+define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_cvtps2dq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_cvtps2dq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
+; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_cvtps2dq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fptosi <8 x float> %a0 to <8 x i32>
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = fptosi <8 x float> %2 to <8 x i32>
+ %4 = or <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+; Divides %a0 by %a1, then divides that quotient by a <4 x double> loaded
+; from %a2 (align 32). The expectations pin one register-register vdivpd
+; followed by one vdivpd that takes its divisor directly from memory.
+define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_divpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_divpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
+; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_divpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
+; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fdiv <4 x double> %a0, %a1
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fdiv <4 x double> %1, %2
+ ret <4 x double> %3
+}
+
+; Divides %a0 by %a1, then divides that quotient by an <8 x float> loaded
+; from %a2 (align 32). The expectations pin one register-register vdivps
+; followed by one vdivps that takes its divisor directly from memory.
+define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_divps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_divps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_divps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
+; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
+; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fdiv <8 x float> %a0, %a1
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = fdiv <8 x float> %1, %2
+ ret <8 x float> %3
+}
+
+; Calls the llvm.x86.avx.dp.ps.256 intrinsic twice with immediate 7: first
+; on %a0 and %a1, then on that result and an <8 x float> loaded from %a2.
+; The expectations pin a register-form vdpps followed by a memory-form vdpps.
+define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_dpps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_dpps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_dpps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dpps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+; Extracts elements 4-7 (the upper 128 bits) of two <8 x float> values: the
+; half taken from %a0 is returned, the half taken from %a1 is stored to %a2.
+; The expectations pin a register-destination and a memory-destination
+; vextractf128. The BTVER2 expectations contain no vzeroupper before the
+; return, while the other three prefixes do.
+define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x float> *%a2) {
+; SANDY-LABEL: test_extractf128:
+; SANDY: # BB#0:
+; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_extractf128:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_extractf128:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extractf128:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ store <4 x float> %2, <4 x float> *%a2
+ ret <4 x float> %1
+}
+
+; Calls the llvm.x86.avx.hadd.pd.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and a <4 x double> loaded from %a2 (align 32). The
+; expectations pin a register-form vhaddpd followed by a memory-form vhaddpd.
+define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_haddpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_haddpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_haddpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_haddpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Calls the llvm.x86.avx.hadd.ps.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and an <8 x float> loaded from %a2 (align 32). The
+; expectations pin a register-form vhaddps followed by a memory-form vhaddps.
+define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_haddps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_haddps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_haddps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_haddps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Calls the llvm.x86.avx.hsub.pd.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and a <4 x double> loaded from %a2 (align 32). The
+; expectations pin a register-form vhsubpd followed by a memory-form vhsubpd.
+define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_hsubpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_hsubpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_hsubpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_hsubpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Calls the llvm.x86.avx.hsub.ps.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and an <8 x float> loaded from %a2 (align 32). The
+; expectations pin a register-form vhsubps followed by a memory-form vhsubps.
+define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_hsubps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_hsubps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_hsubps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_hsubps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Builds two <8 x float> values that keep elements 0-3 of %a0 and take their
+; upper four elements from a <4 x float>: once from the %a1 register argument
+; and once from a 16-byte-aligned load of %a2. Each 128-bit source is first
+; widened with a shufflevector whose upper lanes are undef. The two results
+; are added, and the expectations pin a register-form and a memory-form
+; vinsertf128 followed by vaddps.
+define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; SANDY-LABEL: test_insertf128:
+; SANDY: # BB#0:
+; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
+; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_insertf128:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_insertf128:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
+; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertf128:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
+; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %3 = load <4 x float>, <4 x float> *%a2, align 16
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = shufflevector <8 x float> %a0, <8 x float> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %6 = fadd <8 x float> %2, %5
+ ret <8 x float> %6
+}
+
+; Calls the llvm.x86.avx.ldu.dq.256 intrinsic on the i8* argument and returns
+; the resulting <32 x i8>. The expectations pin a single vlddqu load into
+; ymm0 before the return.
+define <32 x i8> @test_lddqu(i8* %a0) {
+; SANDY-LABEL: test_lddqu:
+; SANDY: # BB#0:
+; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_lddqu:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_lddqu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lddqu:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0)
+ ret <32 x i8> %1
+}
+declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
+
+; Performs a masked load (llvm.x86.avx.maskload.pd) and a masked store
+; (llvm.x86.avx.maskstore.pd) through the same pointer %a0 with the same
+; <2 x i64> mask %a1; the store writes %a2 and the loaded value is returned.
+; NOTE(review): three of the four prefixes print `sched: [?:0.000000e+00]`
+; for both vmaskmovpd forms, which looks like missing scheduling data in
+; those CPU models - confirm before relying on these numbers.
+define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
+; SANDY-LABEL: test_maskmovpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maskmovpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
+; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maskmovpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1)
+ call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2)
+ ret <2 x double> %1
+}
+declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
+declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
+
+; 256-bit variant of the masked double-precision test: a masked load
+; (llvm.x86.avx.maskload.pd.256) and a masked store
+; (llvm.x86.avx.maskstore.pd.256) go through the same pointer %a0 with the
+; same <4 x i64> mask %a1; the store writes %a2 and the load is returned.
+; NOTE(review): three of the four prefixes print `sched: [?:0.000000e+00]`
+; for both vmaskmovpd forms, which looks like missing scheduling data in
+; those CPU models - confirm before relying on these numbers.
+define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
+; SANDY-LABEL: test_maskmovpd_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maskmovpd_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
+; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maskmovpd_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovpd_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
+ call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2)
+ ret <4 x double> %1
+}
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readonly
+declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind
+
+; Performs a masked load (llvm.x86.avx.maskload.ps) and a masked store
+; (llvm.x86.avx.maskstore.ps) through the same pointer %a0 with the same
+; <4 x i32> mask %a1; the store writes %a2 and the loaded value is returned.
+; NOTE(review): three of the four prefixes print `sched: [?:0.000000e+00]`
+; for both vmaskmovps forms, which looks like missing scheduling data in
+; those CPU models - confirm before relying on these numbers.
+define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
+; SANDY-LABEL: test_maskmovps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maskmovps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
+; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maskmovps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1)
+ call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
+declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
+
+; 256-bit variant of the masked single-precision test: a masked load
+; (llvm.x86.avx.maskload.ps.256) and a masked store
+; (llvm.x86.avx.maskstore.ps.256) go through the same pointer %a0 with the
+; same <8 x i32> mask %a1; the store writes %a2 and the load is returned.
+; NOTE(review): three of the four prefixes print `sched: [?:0.000000e+00]`
+; for both vmaskmovps forms, which looks like missing scheduling data in
+; those CPU models - confirm before relying on these numbers.
+define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
+; SANDY-LABEL: test_maskmovps_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maskmovps_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
+; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maskmovps_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovps_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
+ call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readonly
+declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
+
+; Calls the llvm.x86.avx.max.pd.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and a <4 x double> loaded from %a2 (align 32). The
+; expectations pin a register-form vmaxpd followed by a memory-form vmaxpd.
+define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_maxpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maxpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maxpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Calls the llvm.x86.avx.max.ps.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and an <8 x float> loaded from %a2 (align 32). The
+; expectations pin a register-form vmaxps followed by a memory-form vmaxps.
+define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_maxps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_maxps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_maxps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Calls the llvm.x86.avx.min.pd.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and a <4 x double> loaded from %a2 (align 32). The
+; expectations pin a register-form vminpd followed by a memory-form vminpd.
+define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_minpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_minpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_minpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Calls the llvm.x86.avx.min.ps.256 intrinsic twice: first on %a0 and %a1,
+; then on that result and an <8 x float> loaded from %a2 (align 32). The
+; expectations pin a register-form vminps followed by a memory-form vminps.
+define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_minps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_minps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_minps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Loads a <4 x double> from %a0 (align 32), adds it to itself, stores the sum
+; to %a1 (align 32) and also returns it. The expectations pin vmovapd for
+; both the load and the store, with a vaddpd between them.
+define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_movapd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movapd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movapd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movapd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load <4 x double>, <4 x double> *%a0, align 32
+ %2 = fadd <4 x double> %1, %1
+ store <4 x double> %2, <4 x double> *%a1, align 32
+ ret <4 x double> %2
+}
+
+; Loads an <8 x float> from %a0 (align 32), adds it to itself, stores the sum
+; to %a1 (align 32) and also returns it. The expectations pin vmovaps for
+; both the load and the store, with a vaddps between them.
+define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_movaps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movaps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movaps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movaps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load <8 x float>, <8 x float> *%a0, align 32
+ %2 = fadd <8 x float> %1, %1
+ store <8 x float> %2, <8 x float> *%a1, align 32
+ ret <8 x float> %2
+}
+
+; Applies the <0,0,2,2> shuffle to the %a0 register argument and to a
+; <4 x double> loaded from %a1 (align 32), then adds the two results. The
+; expectations pin a register-form and a memory-form vmovddup plus a vaddpd.
+; The expected order differs by prefix: SANDY and HASWELL list the register
+; form first, BTVER2 and ZNVER1 list the memory form first.
+define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_movddup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movddup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movddup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
+; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movddup:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
+; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
+; Calls the llvm.x86.avx.movmsk.pd.256 intrinsic on %a0 and returns its i32
+; result. The expectations pin a single vmovmskpd from ymm0 into eax. The
+; BTVER2 expectations contain no vzeroupper before the return, while the
+; other three prefixes do.
+define i32 @test_movmskpd(<4 x double> %a0) {
+; SANDY-LABEL: test_movmskpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movmskpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movmskpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movmskpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+ ret i32 %1
+}
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
+
+; Calls the llvm.x86.avx.movmsk.ps.256 intrinsic on %a0 and returns its i32
+; result. The expectations pin a single vmovmskps from ymm0 into eax. The
+; BTVER2 expectations contain no vzeroupper before the return, while the
+; other three prefixes do.
+define i32 @test_movmskps(<8 x float> %a0) {
+; SANDY-LABEL: test_movmskps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movmskps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movmskps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movmskps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+ ret i32 %1
+}
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+; Adds %a0 to itself, stores the sum to %a1 with align 32 and !nontemporal
+; metadata, and returns the sum. All four prefixes expect a vaddpd followed by
+; a vmovntpd of %ymm0 to (%rdi).
+define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_movntpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movntpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movntpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fadd <4 x double> %a0, %a0
+ store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0
+ ret <4 x double> %1
+}
+
+; Adds %a0 to itself, stores the sum to %a1 with align 32 and !nontemporal
+; metadata, and returns the sum. All four prefixes expect a vaddps followed by
+; a vmovntps of %ymm0 to (%rdi).
+define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_movntps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movntps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movntps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fadd <8 x float> %a0, %a0
+ store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0
+ ret <8 x float> %1
+}
+
+; Applies the shuffle mask <1,1,3,3,5,5,7,7> to %a0 and to a vector loaded
+; from %a1 (align 32), then adds the two shuffled vectors. The checks expect
+; vmovshdup in both register and memory form: the SANDY and HASWELL blocks
+; list the register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_movshdup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movshdup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movshdup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
+; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movshdup:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
+; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+
+; Applies the shuffle mask <0,0,2,2,4,4,6,6> to %a0 and to a vector loaded
+; from %a1 (align 32), then adds the two shuffled vectors. The checks expect
+; vmovsldup in both register and memory form: the SANDY and HASWELL blocks
+; list the register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_movsldup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movsldup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movsldup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
+; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movsldup:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
+; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+
+; Loads a <4 x double> from %a0 with align 1, adds it to itself, stores the
+; sum to %a1 with align 1, and returns the sum. The SANDY checks expect the
+; 256-bit accesses split into 128-bit halves (vmovups + vinsertf128 for the
+; load, vextractf128 + vmovupd for the store); the other three prefixes expect
+; one 256-bit vmovupd in each direction.
+define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_movupd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movupd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movupd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movupd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load <4 x double>, <4 x double> *%a0, align 1
+ %2 = fadd <4 x double> %1, %1
+ store <4 x double> %2, <4 x double> *%a1, align 1
+ ret <4 x double> %2
+}
+
+; Loads an <8 x float> from %a0 with align 1, adds it to itself, stores the
+; sum to %a1 with align 1, and returns the sum. The SANDY checks expect the
+; 256-bit accesses split into 128-bit halves (vmovups + vinsertf128 for the
+; load, vextractf128 + vmovups for the store); the other three prefixes expect
+; one 256-bit vmovups in each direction.
+define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_movups:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movups:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movups:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movups:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = load <8 x float>, <8 x float> *%a0, align 1
+ %2 = fadd <8 x float> %1, %1
+ store <8 x float> %2, <8 x float> *%a1, align 1
+ ret <8 x float> %2
+}
+
+; Multiplies %a0 by %a1, then multiplies that product by a vector loaded from
+; %a2 (align 32). Each prefix expects a register-form vmulpd followed by a
+; memory-form vmulpd reading (%rdi).
+define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_mulpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_mulpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_mulpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fmul <4 x double> %a0, %a1
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fmul <4 x double> %1, %2
+ ret <4 x double> %3
+}
+
+; Multiplies %a0 by %a1, then multiplies that product by a vector loaded from
+; %a2 (align 32). Each prefix expects a register-form vmulps followed by a
+; memory-form vmulps reading (%rdi).
+define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_mulps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_mulps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_mulps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fmul <8 x float> %a0, %a1
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = fmul <8 x float> %1, %2
+ ret <8 x float> %3
+}
+
+; Bitcasts both vector arguments to <4 x i64>, ORs them, ORs in a vector
+; loaded from %a2 (align 32), bitcasts the result back to <4 x double> and
+; adds %a1 to it. Each prefix expects a register-form vorpd, a memory-form
+; vorpd reading (%rdi), and a vaddpd.
+; NOTE(review): the neighbouring functions are all named test_<mnemonic>
+; (compare test_orps); this one lacks the test_ prefix. Renaming it would also
+; require updating every -LABEL line below - confirm before changing.
+; NOTE(review): the trailing fadd presumably keeps the OR in the
+; floating-point domain so that vorpd is selected - confirm.
+define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: orpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: orpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: orpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: orpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = or <4 x i64> %1, %2
+ %4 = load <4 x double>, <4 x double> *%a2, align 32
+ %5 = bitcast <4 x double> %4 to <4 x i64>
+ %6 = or <4 x i64> %3, %5
+ %7 = bitcast <4 x i64> %6 to <4 x double>
+ %8 = fadd <4 x double> %a1, %7
+ ret <4 x double> %8
+}
+
+; Bitcasts both vector arguments to <4 x i64>, ORs them, ORs in a vector
+; loaded from %a2 (align 32), bitcasts the result back to <8 x float> and
+; adds %a1 to it. Each prefix expects a register-form vorps, a memory-form
+; vorps reading (%rdi), and a vaddps.
+; NOTE(review): the trailing fadd presumably keeps the OR in the
+; floating-point domain so that vorps is selected - confirm.
+define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; SANDY-LABEL: test_orps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_orps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_orps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_orps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <8 x float> %a0 to <4 x i64>
+ %2 = bitcast <8 x float> %a1 to <4 x i64>
+ %3 = or <4 x i64> %1, %2
+ %4 = load <8 x float>, <8 x float> *%a2, align 32
+ %5 = bitcast <8 x float> %4 to <4 x i64>
+ %6 = or <4 x i64> %3, %5
+ %7 = bitcast <4 x i64> %6 to <8 x float>
+ %8 = fadd <8 x float> %a1, %7
+ ret <8 x float> %8
+}
+
+; Applies the shuffle mask <1,0> to %a0 and to a <2 x double> loaded from %a1
+; (align 16), then adds the two shuffled vectors. The checks expect xmm
+; vpermilpd in register and memory form: the SANDY and HASWELL blocks list the
+; register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
+; SANDY-LABEL: test_permilpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
+; BTVER2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
+; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %2 = load <2 x double>, <2 x double> *%a1, align 16
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %4 = fadd <2 x double> %1, %3
+ ret <2 x double> %4
+}
+
+; Applies the shuffle mask <1,0,2,3> to %a0 and to a <4 x double> loaded from
+; %a1 (align 32), then adds the two shuffled vectors. The checks expect ymm
+; vpermilpd in register and memory form: the SANDY and HASWELL blocks list the
+; register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_permilpd_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilpd_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilpd_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
+; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilpd_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
+; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
+; Applies the shuffle mask <3,2,1,0> to %a0 and to a <4 x float> loaded from
+; %a1 (align 16), then adds the two shuffled vectors. The checks expect xmm
+; vpermilps in register and memory form: the SANDY and HASWELL blocks list the
+; register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
+; SANDY-LABEL: test_permilps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
+; BTVER2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
+; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %2 = load <4 x float>, <4 x float> *%a1, align 16
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %4 = fadd <4 x float> %1, %3
+ ret <4 x float> %4
+}
+
+; Applies the shuffle mask <3,2,1,0,7,6,5,4> to %a0 and to an <8 x float>
+; loaded from %a1 (align 32), then adds the two shuffled vectors. The checks
+; expect ymm vpermilps in register and memory form: the SANDY and HASWELL
+; blocks list the register form first, the BTVER2 and ZNVER1 blocks the memory
+; form.
+define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_permilps_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilps_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilps_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
+; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilps_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
+; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+
+; Calls llvm.x86.avx.vpermilvar.pd twice: first on %a0 with the selector %a1,
+; then on that result with a selector loaded from %a2 (align 16). Each prefix
+; expects a register-form xmm vpermilpd followed by a memory-form one reading
+; (%rdi).
+define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; SANDY-LABEL: test_permilvarpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilvarpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilvarpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilvarpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+; Calls llvm.x86.avx.vpermilvar.pd.256 twice: first on %a0 with the selector
+; %a1, then on that result with a selector loaded from %a2 (align 32). Each
+; prefix expects a register-form ymm vpermilpd followed by a memory-form one
+; reading (%rdi).
+define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; SANDY-LABEL: test_permilvarpd_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilvarpd_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilvarpd_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilvarpd_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2)
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+; Calls llvm.x86.avx.vpermilvar.ps twice: first on %a0 with the selector %a1,
+; then on that result with a selector loaded from %a2 (align 16). Each prefix
+; expects a register-form xmm vpermilps followed by a memory-form one reading
+; (%rdi).
+define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; SANDY-LABEL: test_permilvarps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilvarps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilvarps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilvarps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+; Calls llvm.x86.avx.vpermilvar.ps.256 twice: first on %a0 with the selector
+; %a1, then on that result with a selector loaded from %a2 (align 32). Each
+; prefix expects a register-form ymm vpermilps followed by a memory-form one
+; reading (%rdi).
+define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; SANDY-LABEL: test_permilvarps_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_permilvarps_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_permilvarps_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_permilvarps_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2)
+ ret <8 x float> %3
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+; Calls llvm.x86.avx.rcp.ps.256 on %a0 and on a vector loaded from %a1
+; (align 32), then adds the two results. The checks expect vrcpps in register
+; and memory form: the SANDY block lists the register form first, the other
+; three prefixes list the memory form first.
+define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_rcpps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_rcpps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_rcpps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcpps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
+; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2)
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+; Calls llvm.x86.avx.round.pd.256 with the immediate 7 on %a0 and on a vector
+; loaded from %a1 (align 32), then adds the two results. The checks expect
+; vroundpd $7 in register and memory form: the SANDY and HASWELL blocks list
+; the register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
+; SANDY-LABEL: test_roundpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
+; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7)
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+; Calls llvm.x86.avx.round.ps.256 with the immediate 7 on %a0 and on a vector
+; loaded from %a1 (align 32), then adds the two results. The checks expect
+; vroundps $7 in register and memory form: the SANDY and HASWELL blocks list
+; the register form first, the BTVER2 and ZNVER1 blocks the memory form.
+define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_roundps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
+; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
+; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7)
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+; Calls llvm.x86.avx.rsqrt.ps.256 on %a0 and on a vector loaded from %a1
+; (align 32), then adds the two results. The checks expect vrsqrtps in
+; register and memory form: the SANDY block lists the register form first, the
+; other three prefixes list the memory form first.
+define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
+; SANDY-LABEL: test_rsqrtps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_rsqrtps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_rsqrtps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
+; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rsqrtps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
+; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2)
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+; Shuffles %a0 with %a1 using the mask <1,4,2,7>, shuffles %a1 with a vector
+; loaded from %a2 (align 32) using the same mask, and adds the two results.
+; All four prefixes expect a register-form vshufpd, then a memory-form
+; vshufpd, then a vaddpd.
+define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; SANDY-LABEL: test_shufpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_shufpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_shufpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
+; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shufpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
+; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { ; exercises 256-bit vshufps with a register second source and with a memory second source
+; SANDY-LABEL: test_shufps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_shufps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_shufps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50]
+; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shufps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50]
+; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> ; both sources are arguments
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 3, i32 8, i32 8, i32 4, i32 7, i32 12, i32 12> ; consumes the first shuffle and the loaded value (mem[...] in the checks)
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { ; exercises 256-bit vsqrtpd once on a register argument and once on a loaded value
+; SANDY-LABEL: test_sqrtpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_sqrtpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
+; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_sqrtpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
+; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
+; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; operand is the register argument
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) ; operand is the loaded value; the checks expect the load folded as (%rdi)
+ %4 = fadd <4 x double> %1, %3 ; the returned sum uses both results
+ ret <4 x double> %4
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { ; exercises 256-bit vsqrtps once on a register argument and once on a loaded value
+; SANDY-LABEL: test_sqrtps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_sqrtps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
+; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_sqrtps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
+; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
+; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; operand is the register argument
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) ; operand is the loaded value; the checks expect the load folded as (%rdi)
+ %4 = fadd <8 x float> %1, %3 ; the returned sum uses both results
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; exercises 256-bit vsubpd with a register subtrahend and with a memory subtrahend
+; SANDY-LABEL: test_subpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_subpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_subpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fsub <4 x double> %a0, %a1 ; both operands are arguments
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = fsub <4 x double> %1, %2 ; chains on the first result and subtracts the loaded value (folded as (%rdi) in the checks)
+ ret <4 x double> %3
+}
+
+define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; exercises 256-bit vsubps with a register subtrahend and with a memory subtrahend
+; SANDY-LABEL: test_subps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_subps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_subps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = fsub <8 x float> %a0, %a1 ; both operands are arguments
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = fsub <8 x float> %1, %2 ; chains on the first result and subtracts the loaded value (folded as (%rdi) in the checks)
+ ret <8 x float> %3
+}
+
+define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; exercises 128-bit vtestpd (vtestc intrinsic) with a register operand and with a memory operand
+; SANDY-LABEL: test_testpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: setb %al # sched: [1:0.33]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_testpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_testpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: setb %al # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_testpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: setb %al # sched: [1:0.50]
+; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; both operands are arguments
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %4 = add i32 %1, %3 ; the checks expect this sum as setb for the first test plus adcl $0 after the second
+ ret i32 %4
+}
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; 256-bit variant of test_testpd; the checks expect vzeroupper before retq under every prefix except BTVER2
+; SANDY-LABEL: test_testpd_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: setb %al # sched: [1:0.33]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_testpd_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_testpd_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: setb %al # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_testpd_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: setb %al # sched: [1:0.50]
+; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; both operands are arguments
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %4 = add i32 %1, %3 ; the checks expect this sum as setb for the first test plus adcl $0 after the second
+ ret i32 %4
+}
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; exercises 128-bit vtestps (vtestc intrinsic) with a register operand and with a memory operand
+; SANDY-LABEL: test_testps:
+; SANDY: # BB#0:
+; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: setb %al # sched: [1:0.33]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_testps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_testps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: setb %al # sched: [1:0.50]
+; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_testps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: setb %al # sched: [1:0.50]
+; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; both operands are arguments
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %4 = add i32 %1, %3 ; the checks expect this sum as setb for the first test plus adcl $0 after the second
+ ret i32 %4
+}
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; 256-bit variant of test_testps; the checks expect vzeroupper before retq under every prefix except BTVER2
+; SANDY-LABEL: test_testps_ymm:
+; SANDY: # BB#0:
+; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: setb %al # sched: [1:0.33]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_testps_ymm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_testps_ymm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: setb %al # sched: [1:0.50]
+; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_testps_ymm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: setb %al # sched: [1:0.50]
+; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; both operands are arguments
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %4 = add i32 %1, %3 ; the checks expect this sum as setb for the first test plus adcl $0 after the second
+ ret i32 %4
+}
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; exercises 256-bit vunpckhpd with a register second source and with a memory second source
+; SANDY-LABEL: test_unpckhpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_unpckhpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_unpckhpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
+; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpckhpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
+; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; both sources are arguments
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; same mask with the loaded value as second source (mem[...] in the checks)
+ %4 = fadd <4 x double> %1, %3 ; the returned sum uses both shuffles
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { ; exercises 256-bit vunpckhps with a register second source and with a memory second source
+; SANDY-LABEL: test_unpckhps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_unpckhps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_unpckhps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50]
+; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpckhps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50]
+; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ; both sources are arguments
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ; consumes the first shuffle and the loaded value (mem[...] in the checks)
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; exercises 256-bit vunpcklpd with a register second source and with a memory second source
+; SANDY-LABEL: test_unpcklpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_unpcklpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_unpcklpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
+; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpcklpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ; both sources are arguments
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ; same mask with the loaded value as second source (mem[...] in the checks)
+ %4 = fadd <4 x double> %1, %3 ; the returned sum uses both shuffles
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { ; exercises 256-bit vunpcklps with a register second source and with a memory second source
+; SANDY-LABEL: test_unpcklps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_unpcklps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_unpcklps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50]
+; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpcklps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50]
+; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ; both sources are arguments
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ; consumes the first shuffle and the loaded value (mem[...] in the checks)
+ ret <8 x float> %3
+}
+
+define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; exercises 256-bit vxorpd with a register operand and with a memory operand
+; SANDY-LABEL: test_xorpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_xorpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_xorpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xorpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <4 x double> %a0 to <4 x i64> ; xor is an integer operation, so the FP vectors are reinterpreted as <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, %2 ; both operands are arguments
+ %4 = load <4 x double>, <4 x double> *%a2, align 32
+ %5 = bitcast <4 x double> %4 to <4 x i64>
+ %6 = xor <4 x i64> %3, %5 ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %7 = bitcast <4 x i64> %6 to <4 x double>
+ %8 = fadd <4 x double> %a1, %7 ; NOTE(review): presumably the FP use is what makes the checks see vxorpd rather than an integer xor - confirm
+ ret <4 x double> %8
+}
+
+define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; exercises 256-bit vxorps with a register operand and with a memory operand
+; SANDY-LABEL: test_xorps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_xorps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_xorps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xorps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ %1 = bitcast <8 x float> %a0 to <4 x i64> ; xor is an integer operation, so the FP vectors are reinterpreted as <4 x i64>
+ %2 = bitcast <8 x float> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, %2 ; both operands are arguments
+ %4 = load <8 x float>, <8 x float> *%a2, align 32
+ %5 = bitcast <8 x float> %4 to <4 x i64>
+ %6 = xor <4 x i64> %3, %5 ; second operand is the loaded value (folded as (%rdi) in the checks)
+ %7 = bitcast <4 x i64> %6 to <8 x float>
+ %8 = fadd <8 x float> %a1, %7 ; NOTE(review): presumably the FP use is what makes the checks see vxorps rather than an integer xor - confirm
+ ret <8 x float> %8
+}
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll
index 12aa863a37a15..b75db95869c27 100644
--- a/test/CodeGen/X86/bitcast2.ll
+++ b/test/CodeGen/X86/bitcast2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movd | count 2
+; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movq | count 2
; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp
define i64 @test1(double %A) {
diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll
index d0967c1021492..1b69b5542556a 100644
--- a/test/CodeGen/X86/bool-ext-inc.ll
+++ b/test/CodeGen/X86/bool-ext-inc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s
; FIXME: add (sext i1 X), 1 -> zext (not i1 X)
@@ -20,13 +20,93 @@ define i32 @sext_inc(i1 zeroext %x) nounwind {
define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind {
; CHECK-LABEL: sext_inc_vec:
; CHECK: # BB#0:
-; CHECK-NEXT: pslld $31, %xmm0
-; CHECK-NEXT: psrad $31, %xmm0
-; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%ext = sext <4 x i1> %x to <4 x i32>
%add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %add
}
+define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; add 1 to the sign-extended result of a vector signed-greater-than compare
+; CHECK-LABEL: cmpgt_sext_inc_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %cmp = icmp sgt <4 x i32> %x, %y
+ %ext = sext <4 x i1> %cmp to <4 x i32> ; each lane becomes 0 or -1
+ %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> ; the checks expect the constant loaded via vpbroadcastd and added with vpaddd
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; add 1 to the sign-extended result of a vector not-equal compare
+; CHECK-LABEL: cmpne_sext_inc_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %cmp = icmp ne <4 x i32> %x, %y ; the checks expect this as vpcmpeqd followed by a vpxor with an all-ones register
+ %ext = sext <4 x i1> %cmp to <4 x i32> ; each lane becomes 0 or -1
+ %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
+
+define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { ; 256-bit (<4 x i64>) version of the compare, sign-extend, add-1 pattern
+; CHECK-LABEL: cmpgt_sext_inc_vec256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %cmp = icmp sgt <4 x i64> %x, %y
+ %ext = sext <4 x i1> %cmp to <4 x i64> ; each lane becomes 0 or -1
+ %add = add <4 x i64> %ext, <i64 1, i64 1, i64 1, i64 1> ; the checks expect the constant loaded via vpbroadcastq and added with vpaddq
+ ret <4 x i64> %add
+}
+
+define i32 @bool_logic_and_math(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; scalar case: add 1 to the sign-extended AND of two not-equal compares
+; CHECK-LABEL: bool_logic_and_math:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: cmpl %ecx, %edx
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: andb %al, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp1 = icmp ne i32 %a, %b
+ %cmp2 = icmp ne i32 %c, %d
+ %and = and i1 %cmp1, %cmp2 ; boolean logic performed on i1 values
+ %ext = sext i1 %and to i32 ; 0 or -1
+ %add = add i32 %ext, 1 ; the checks expect this as 1 minus the zero-extended bool (movzbl, movl $1, subl)
+ ret i32 %add
+}
+
+define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { ; vector version of bool_logic_and_math
+; CHECK-LABEL: bool_logic_and_math_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpxor %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %cmp1 = icmp ne <4 x i32> %a, %b
+ %cmp2 = icmp ne <4 x i32> %c, %d
+ %and = and <4 x i1> %cmp1, %cmp2 ; the checks expect vpandn of the a==b mask with the inverted c==d mask
+ %ext = sext <4 x i1> %and to <4 x i32> ; each lane becomes 0 or -1
+ %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index c425e3a92d173..ae0f4406ba0d2 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -928,7 +928,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rcx
; SSE-NEXT: movq %rcx, %r8
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rcx, %r10
@@ -938,7 +938,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; SSE-NEXT: movq %rcx, %rdi
; SSE-NEXT: andb $15, %cl
; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movd %xmm1, %rcx
+; SSE-NEXT: movq %xmm1, %rcx
; SSE-NEXT: shrq $56, %rdi
; SSE-NEXT: andb $15, %dil
; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
@@ -1106,7 +1106,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rcx
; SSE-NEXT: movq %rcx, %r8
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rcx, %r10
@@ -1116,7 +1116,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; SSE-NEXT: movq %rcx, %rdi
; SSE-NEXT: andb $15, %cl
; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movd %xmm2, %rcx
+; SSE-NEXT: movq %xmm2, %rcx
; SSE-NEXT: shrq $56, %rdi
; SSE-NEXT: andb $15, %dil
; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 44c4510c89e1a..706e89051a3da 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -223,18 +223,17 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE: # BB#0:
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: psrlq $48, %xmm1
+; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index 71f6c3e633342..e1e849929405a 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -76,6 +76,53 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
ret <4 x i32> %1
}
+define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_udiv_by_pow2c:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrld %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrld %xmm1, %xmm2
+; SSE-NEXT: psrld %xmm3, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_udiv_by_pow2c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_udiv_by_pow2c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
+ %2 = udiv <4 x i32> %x, %1
+ ret <4 x i32> %2
+}
+
; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_shl_pow2a:
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index f412e9ca6312b..91da268a8d75a 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -64,6 +64,99 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
ret <4 x i32> %1
}
+define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_pow2c:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $23, %xmm1
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_pow2c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_pow2c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
+ %2 = urem <4 x i32> %x, %1
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_pow2d:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: psrld %xmm2, %xmm5
+; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: psrld %xmm1, %xmm2
+; SSE-NEXT: psrld %xmm4, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_pow2d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_pow2d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y
+ %2 = urem <4 x i32> %x, %1
+ ret <4 x i32> %2
+}
+
; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll
new file mode 100644
index 0000000000000..83589b7706f75
--- /dev/null
+++ b/test/CodeGen/X86/constant-hoisting-bfi.ll
@@ -0,0 +1,115 @@
+; RUN: opt -consthoist -mtriple=x86_64-unknown-linux-gnu -consthoist-with-block-frequency=true -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that when BFI is enabled for constant hoisting, constant 214748364701
+; is not hoisted to the function entry block.
+; CHECK-LABEL: @foo(
+; CHECK: entry:
+; CHECK-NOT: bitcast i64 214748364701 to i64
+; CHECK: if.then:
+
+; Function Attrs: norecurse nounwind uwtable
+define i64 @foo(i64* nocapture %a) {
+entry:
+ %arrayidx = getelementptr inbounds i64, i64* %a, i64 9
+ %t0 = load i64, i64* %arrayidx, align 8
+ %cmp = icmp slt i64 %t0, 564
+ br i1 %cmp, label %if.then, label %if.else5
+
+if.then: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5
+ %t1 = load i64, i64* %arrayidx1, align 8
+ %cmp2 = icmp slt i64 %t1, 1009
+ br i1 %cmp2, label %if.then3, label %return
+
+if.then3: ; preds = %if.then
+ %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 6
+ %t2 = load i64, i64* %arrayidx4, align 8
+ %inc = add nsw i64 %t2, 1
+ store i64 %inc, i64* %arrayidx4, align 8
+ br label %return
+
+if.else5: ; preds = %entry
+ %arrayidx6 = getelementptr inbounds i64, i64* %a, i64 6
+ %t3 = load i64, i64* %arrayidx6, align 8
+ %cmp7 = icmp slt i64 %t3, 3512
+ br i1 %cmp7, label %if.then8, label %return
+
+if.then8: ; preds = %if.else5
+ %arrayidx9 = getelementptr inbounds i64, i64* %a, i64 7
+ %t4 = load i64, i64* %arrayidx9, align 8
+ %inc10 = add nsw i64 %t4, 1
+ store i64 %inc10, i64* %arrayidx9, align 8
+ br label %return
+
+return: ; preds = %if.else5, %if.then, %if.then8, %if.then3
+ %retval.0 = phi i64 [ 214748364701, %if.then3 ], [ 214748364701, %if.then8 ], [ 250148364702, %if.then ], [ 256148364704, %if.else5 ]
+ ret i64 %retval.0
+}
+
+; Check that when BFI is enabled for constant hoisting, constant 214748364701
+; in while.body is hoisted to while.body.preheader, and that the uses of
+; 214748364701 in if.then16 and if.else10 are merged and hoisted to the
+; beginning of if.else10, because if.else10 dominates if.then16.
+; CHECK-LABEL: @goo(
+; CHECK: entry:
+; CHECK-NOT: bitcast i64 214748364701 to i64
+; CHECK: while.body.preheader:
+; CHECK-NEXT: bitcast i64 214748364701 to i64
+; CHECK-NOT: bitcast i64 214748364701 to i64
+; CHECK: if.else10:
+; CHECK-NEXT: bitcast i64 214748364701 to i64
+; CHECK-NOT: bitcast i64 214748364701 to i64
+define i64 @goo(i64* nocapture %a) {
+entry:
+ %arrayidx = getelementptr inbounds i64, i64* %a, i64 9
+ %t0 = load i64, i64* %arrayidx, align 8
+ %cmp = icmp ult i64 %t0, 56
+ br i1 %cmp, label %if.then, label %if.else10, !prof !0
+
+if.then: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5
+ %t1 = load i64, i64* %arrayidx1, align 8
+ %cmp2 = icmp ult i64 %t1, 10
+ br i1 %cmp2, label %while.cond.preheader, label %return, !prof !0
+
+while.cond.preheader: ; preds = %if.then
+ %arrayidx7 = getelementptr inbounds i64, i64* %a, i64 6
+ %t2 = load i64, i64* %arrayidx7, align 8
+ %cmp823 = icmp ugt i64 %t2, 10000
+ br i1 %cmp823, label %while.body.preheader, label %return
+
+while.body.preheader: ; preds = %while.cond.preheader
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %t3 = phi i64 [ %add, %while.body ], [ %t2, %while.body.preheader ]
+ %add = add i64 %t3, 214748364701
+ %cmp8 = icmp ugt i64 %add, 10000
+ br i1 %cmp8, label %while.body, label %while.cond.return.loopexit_crit_edge
+
+if.else10: ; preds = %entry
+ %arrayidx11 = getelementptr inbounds i64, i64* %a, i64 6
+ %t4 = load i64, i64* %arrayidx11, align 8
+ %add2 = add i64 %t4, 214748364701
+ %cmp12 = icmp ult i64 %add2, 35
+ br i1 %cmp12, label %if.then16, label %return, !prof !0
+
+if.then16: ; preds = %if.else10
+ %arrayidx17 = getelementptr inbounds i64, i64* %a, i64 7
+ %t5 = load i64, i64* %arrayidx17, align 8
+ %inc = add i64 %t5, 1
+ store i64 %inc, i64* %arrayidx17, align 8
+ br label %return
+
+while.cond.return.loopexit_crit_edge: ; preds = %while.body
+ store i64 %add, i64* %arrayidx7, align 8
+ br label %return
+
+return: ; preds = %while.cond.preheader, %while.cond.return.loopexit_crit_edge, %if.else10, %if.then, %if.then16
+ %retval.0 = phi i64 [ 214748364701, %if.then16 ], [ 0, %if.then ], [ 0, %if.else10 ], [ 0, %while.cond.return.loopexit_crit_edge ], [ 0, %while.cond.preheader ]
+ ret i64 %retval.0
+}
+
+!0 = !{!"branch_weights", i32 1, i32 2000}
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index a283bcc6d460c..726e30fce63b3 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -30,7 +30,7 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
; X64-NEXT: shlq $32, %rcx
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: movd %rax, %xmm0
+; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT: movd %xmm0, %eax
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
index 612807dd8123e..c2111f672a2e3 100644
--- a/test/CodeGen/X86/dwarf-headers.ll
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -1,16 +1,16 @@
-; RUN: llc -split-dwarf=Disable -dwarf-version=4 -generate-type-units \
+; RUN: llc -dwarf-version=4 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-4
-; RUN: llc -split-dwarf=Enable -dwarf-version=4 -generate-type-units \
+; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=4 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-4
-; RUN: llc -split-dwarf=Disable -dwarf-version=5 -generate-type-units \
+; RUN: llc -dwarf-version=5 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-5
-; RUN: llc -split-dwarf=Enable -dwarf-version=5 -generate-type-units \
+; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=5 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-5
diff --git a/test/CodeGen/X86/eh-frame-unreachable.ll b/test/CodeGen/X86/eh-frame-unreachable.ll
new file mode 100644
index 0000000000000..a7abc8a057fb9
--- /dev/null
+++ b/test/CodeGen/X86/eh-frame-unreachable.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; Test that we don't emit a row that extends beyond the FDE's range_size.
+;
+; CHECK: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_endproc
+; CHECK-NOT: .cfi
+
+define void @f() #0 {
+ unreachable
+}
+attributes #0 = { "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/empty-function.ll b/test/CodeGen/X86/empty-function.ll
new file mode 100644
index 0000000000000..92bebd0ab1a7c
--- /dev/null
+++ b/test/CodeGen/X86/empty-function.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=LINUX %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+; Don't emit empty functions on Windows; it can lead to duplicate entries
+; (multiple functions sharing the same RVA) in the Guard CF Function Table which
+; the kernel refuses to load.
+
+define void @f() {
+entry:
+ unreachable
+
+; CHECK-LABEL: f:
+; WIN32: nop
+; WIN64: ud2
+; LINUX-NOT: nop
+; LINUX-NOT: ud2
+
+}
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index 735df2a4196d5..0c139534e567d 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -23,8 +23,6 @@ entry:
; CHECK-FP-NEXT: :
; CHECK-FP-NEXT: .cfi_offset %rbp, -16
; CHECK-FP-NEXT: movq %rsp, %rbp
-; CHECK-FP-NEXT: :
-; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
; CHECK-FP-NEXT: .cfi_endproc
; An empty function is perfectly fine on ELF.
@@ -35,9 +33,7 @@ entry:
; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func
; LINUX-NO-FP-NEXT: .cfi_endproc
-; A cfi directive can point to the end of a function. It (and in fact the
-; entire body) could be optimized out because of the unreachable, but we
-; don't do it right now.
+; A cfi directive cannot point to the end of a function.
; LINUX-FP: func:
; LINUX-FP-NEXT: .cfi_startproc
; LINUX-FP-NEXT: {{^}}#
@@ -48,7 +44,5 @@ entry:
; LINUX-FP-NEXT: .cfi_offset %rbp, -16
; LINUX-FP-NEXT: movq %rsp, %rbp
; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
-; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp
-; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
; LINUX-FP-NEXT: .size func, .Lfunc_end0-func
; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index e36e33ffe66b7..228ce70b40097 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -320,7 +320,7 @@ define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind {
define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v2i64_0:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_0:
@@ -335,7 +335,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v2i64_1:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v2i64_1:
@@ -355,7 +355,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_1:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v4i64_1:
@@ -376,7 +376,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_3:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v4i64_3:
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index 5c481197c3b49..d68236e9d250e 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -7,7 +7,6 @@ target triple = "i386--netbsd"
; CHECK-LABEL: fn1
; CHECK: addl {{.*#+}} 4-byte Folded Reload
-; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: retl
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index f7d4eb380d574..c3109673468ec 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -11,7 +11,7 @@
; LIN: movdqa (%rsi), %xmm0
; LIN: pand (%rdx), %xmm0
; LIN: pextrq $1, %xmm0, %r[[REG4:.+]]
-; LIN: movd %xmm0, %r[[REG2:.+]]
+; LIN: movq %xmm0, %r[[REG2:.+]]
; LIN: movslq %e[[REG2]], %r[[REG1:.+]]
; LIN: sarq $32, %r[[REG2]]
; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
@@ -24,7 +24,7 @@
; WIN: movdqa (%rdx), %xmm0
; WIN: pand (%r8), %xmm0
; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]
-; WIN: movd %xmm0, %r[[REG2:.+]]
+; WIN: movq %xmm0, %r[[REG2:.+]]
; WIN: movslq %e[[REG2]], %r[[REG1:.+]]
; WIN: sarq $32, %r[[REG2]]
; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll
index a745f652d0653..7b2656897e0e8 100644
--- a/test/CodeGen/X86/i256-add.ll
+++ b/test/CodeGen/X86/i256-add.ll
@@ -12,34 +12,35 @@ define void @add(i256* %p, i256* %q) nounwind {
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 8(%ecx), %edx
-; X32-NEXT: movl (%ecx), %ebx
-; X32-NEXT: movl 4(%ecx), %edi
+; X32-NEXT: movl 8(%ecx), %edi
+; X32-NEXT: movl (%ecx), %edx
+; X32-NEXT: movl 4(%ecx), %ebx
; X32-NEXT: movl 28(%eax), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 24(%eax), %ebp
-; X32-NEXT: addl (%eax), %ebx
-; X32-NEXT: adcl 4(%eax), %edi
-; X32-NEXT: adcl 8(%eax), %edx
+; X32-NEXT: addl (%eax), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %esi
+; X32-NEXT: adcl 4(%eax), %ebx
+; X32-NEXT: adcl 8(%eax), %edi
+; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT: movl 20(%eax), %edi
; X32-NEXT: movl 12(%eax), %edx
-; X32-NEXT: movl 16(%eax), %eax
+; X32-NEXT: movl 16(%eax), %esi
; X32-NEXT: adcl 12(%ecx), %edx
-; X32-NEXT: adcl 16(%ecx), %eax
-; X32-NEXT: adcl 20(%ecx), %esi
-; X32-NEXT: adcl 24(%ecx), %ebp
-; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT: adcl 16(%ecx), %esi
+; X32-NEXT: adcl 20(%ecx), %edi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: adcl 24(%ecx), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
; X32-NEXT: adcl %ebp, 28(%ecx)
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, 8(%ecx)
+; X32-NEXT: movl %ebx, 4(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, (%ecx)
-; X32-NEXT: movl %edi, 4(%ecx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 8(%ecx)
; X32-NEXT: movl %edx, 12(%ecx)
-; X32-NEXT: movl %eax, 16(%ecx)
-; X32-NEXT: movl %esi, 20(%ecx)
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %esi, 16(%ecx)
+; X32-NEXT: movl %edi, 20(%ecx)
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: addl $12, %esp
; X32-NEXT: popl %esi
@@ -58,9 +59,9 @@ define void @add(i256* %p, i256* %q) nounwind {
; X64-NEXT: adcq 8(%rsi), %rdx
; X64-NEXT: adcq 16(%rsi), %rax
; X64-NEXT: adcq %r8, 24(%rdi)
-; X64-NEXT: movq %rcx, (%rdi)
-; X64-NEXT: movq %rdx, 8(%rdi)
; X64-NEXT: movq %rax, 16(%rdi)
+; X64-NEXT: movq %rdx, 8(%rdi)
+; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%a = load i256, i256* %p
%b = load i256, i256* %q
@@ -96,9 +97,9 @@ define void @sub(i256* %p, i256* %q) nounwind {
; X32-NEXT: sbbl 24(%esi), %eax
; X32-NEXT: movl 28(%esi), %esi
; X32-NEXT: sbbl %esi, 28(%ecx)
-; X32-NEXT: movl %ebx, (%ecx)
-; X32-NEXT: movl %ebp, 4(%ecx)
; X32-NEXT: movl %edi, 8(%ecx)
+; X32-NEXT: movl %ebp, 4(%ecx)
+; X32-NEXT: movl %ebx, (%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: movl %esi, 12(%ecx)
; X32-NEXT: movl (%esp), %esi # 4-byte Reload
@@ -122,9 +123,9 @@ define void @sub(i256* %p, i256* %q) nounwind {
; X64-NEXT: sbbq 8(%rsi), %rdx
; X64-NEXT: sbbq 16(%rsi), %rax
; X64-NEXT: sbbq %r8, 24(%rdi)
-; X64-NEXT: movq %rcx, (%rdi)
-; X64-NEXT: movq %rdx, 8(%rdi)
; X64-NEXT: movq %rax, 16(%rdi)
+; X64-NEXT: movq %rdx, 8(%rdi)
+; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%a = load i256, i256* %p
%b = load i256, i256* %q
diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll
index 3da1a360e2904..f2fbff1431213 100644
--- a/test/CodeGen/X86/i64-to-float.ll
+++ b/test/CodeGen/X86/i64-to-float.ll
@@ -251,11 +251,11 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X64-SSE-NEXT: pandn %xmm3, %xmm0
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm1
; X64-SSE-NEXT: por %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %rax
+; X64-SSE-NEXT: movq %xmm1, %rax
; X64-SSE-NEXT: xorps %xmm0, %xmm0
; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm1, %rax
+; X64-SSE-NEXT: movq %xmm1, %rax
; X64-SSE-NEXT: xorps %xmm1, %xmm1
; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm1
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/test/CodeGen/X86/insertelement-duplicates.ll b/test/CodeGen/X86/insertelement-duplicates.ll
new file mode 100644
index 0000000000000..b07343362144a
--- /dev/null
+++ b/test/CodeGen/X86/insertelement-duplicates.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
+
+define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline {
+; SSE-32-LABEL: PR15298:
+; SSE-32: # BB#0: # %L.entry
+; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT: movaps 304(%ecx), %xmm0
+; SSE-32-NEXT: xorps %xmm1, %xmm1
+; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
+; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-32-NEXT: movups %xmm1, 624(%eax)
+; SSE-32-NEXT: movups %xmm0, 608(%eax)
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: PR15298:
+; SSE-64: # BB#0: # %L.entry
+; SSE-64-NEXT: movaps 304(%rdi), %xmm0
+; SSE-64-NEXT: xorps %xmm1, %xmm1
+; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-64-NEXT: movups %xmm1, 624(%rsi)
+; SSE-64-NEXT: movups %xmm0, 608(%rsi)
+; SSE-64-NEXT: retq
+;
+; AVX-32-LABEL: PR15298:
+; AVX-32: # BB#0: # %L.entry
+; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
+; AVX-32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
+; AVX-32-NEXT: vzeroupper
+; AVX-32-NEXT: retl
+;
+; AVX-64-LABEL: PR15298:
+; AVX-64: # BB#0: # %L.entry
+; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
+; AVX-64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
+; AVX-64-NEXT: vzeroupper
+; AVX-64-NEXT: retq
+L.entry:
+ %0 = getelementptr inbounds <4 x float>, <4 x float>* %source, i32 19
+ %1 = load <4 x float>, <4 x float>* %0, align 16
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %2, i32 2
+ %4 = insertelement <8 x float> %3, float %2, i32 1
+ %5 = getelementptr <8 x float>, <8 x float>* %dest, i32 19
+ store <8 x float> %4, <8 x float>* %5, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll
index ea38d9e4ec296..89e5f9481188e 100644
--- a/test/CodeGen/X86/isint.ll
+++ b/test/CodeGen/X86/isint.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK64 %s
; PR19059
-; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK32 %s
define i32 @isint_return(double %d) nounwind {
; CHECK-LABEL: isint_return:
@@ -15,7 +14,8 @@ define i32 @isint_return(double %d) nounwind {
%c = fcmp oeq double %d, %e
; CHECK32-NOT: movd {{.*}}, %r{{.*}}
; CHECK32-NOT: andq
-; CHECK-NEXT: movd
+; CHECK32-NEXT: movd
+; CHECK64-NEXT: movq
; CHECK-NEXT: andl
%z = zext i1 %c to i32
ret i32 %z
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index 62020c2d19146..79f90f49c7c6b 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -44,16 +44,16 @@ define double @test2(double %A, double %B) {
define i64 @test3(i64 %A) {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
-; CHECK-NEXT: movd %rdi, %xmm0
+; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movd %xmm0, %rax
+; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test3:
; CHECK-WIDE: # BB#0:
-; CHECK-WIDE-NEXT: movd %rdi, %xmm0
+; CHECK-WIDE-NEXT: movq %rdi, %xmm0
; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: movd %xmm0, %rax
+; CHECK-WIDE-NEXT: movq %xmm0, %rax
; CHECK-WIDE-NEXT: retq
%1 = bitcast i64 %A to <2 x float>
%add = fadd <2 x float> %1, <float 3.0, float 5.0>
@@ -67,18 +67,18 @@ define i64 @test3(i64 %A) {
define i64 @test4(i64 %A) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
-; CHECK-NEXT: movd %rdi, %xmm0
+; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: movd %xmm0, %rax
+; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test4:
; CHECK-WIDE: # BB#0:
-; CHECK-WIDE-NEXT: movd %rdi, %xmm0
+; CHECK-WIDE-NEXT: movq %rdi, %xmm0
; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: movd %xmm0, %rax
+; CHECK-WIDE-NEXT: movq %xmm0, %rax
; CHECK-WIDE-NEXT: retq
%1 = bitcast i64 %A to <2 x i32>
%add = add <2 x i32> %1, <i32 3, i32 5>
diff --git a/test/CodeGen/X86/memcpy-struct-by-value.ll b/test/CodeGen/X86/memcpy-struct-by-value.ll
new file mode 100644
index 0000000000000..2e7a64d84000d
--- /dev/null
+++ b/test/CodeGen/X86/memcpy-struct-by-value.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; FIXME: The documentation states that ivybridge has ermsb, but this is not
+; enabled right now since I could not confirm by testing.
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+
+%struct.large = type { [4096 x i8] }
+
+declare void @foo(%struct.large* align 8 byval) nounwind
+
+define void @test1(%struct.large* nocapture %x) nounwind {
+ call void @foo(%struct.large* align 8 byval %x)
+ ret void
+
+; ALL-LABEL: test1:
+; NOFAST: rep;movsq
+; NOFAST32: rep;movsl
+; FAST: rep;movsb
+}
+
+define void @test2(%struct.large* nocapture %x) nounwind minsize {
+ call void @foo(%struct.large* align 8 byval %x)
+ ret void
+
+; ALL-LABEL: test2:
+; NOFAST: rep;movsq
+; NOFAST32: rep;movsl
+; FAST: rep;movsb
+}
+
+%struct.large_oddsize = type { [4095 x i8] }
+
+declare void @foo_oddsize(%struct.large_oddsize* align 8 byval) nounwind
+
+define void @test3(%struct.large_oddsize* nocapture %x) nounwind minsize {
+ call void @foo_oddsize(%struct.large_oddsize* align 8 byval %x)
+ ret void
+
+; ALL-LABEL: test3:
+; NOFAST: rep;movsb
+; NOFAST32: rep;movsb
+; FAST: rep;movsb
+}
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index dcb7bd010e56b..f4c4c6d360676 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -29,17 +29,8 @@ entry:
ret void
}
-
-
;; CHECK-LABEL: indexed-store-merge
-
-;; We should be able to merge the 4 consecutive stores.
-;; FIXMECHECK: movl $0, 2(%rsi,%rdi)
-
-;; CHECK: movb $0, 2(%rsi,%rdi)
-;; CHECK: movb $0, 3(%rsi,%rdi)
-;; CHECK: movb $0, 4(%rsi,%rdi)
-;; CHECK: movb $0, 5(%rsi,%rdi)
+;; CHECK: movl $0, 2(%rsi,%rdi)
;; CHECK: movb $0, (%rsi)
define void @indexed-store-merge(i64 %p, i8* %v) {
entry:
diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll
index 9128e5cb4c9de..30cf474dc38b7 100644
--- a/test/CodeGen/X86/mmx-bitcast.ll
+++ b/test/CodeGen/X86/mmx-bitcast.ll
@@ -80,7 +80,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
; CHECK-NEXT: movd %esi, %xmm0
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: retq
%v0 = insertelement <2 x i32> undef, i32 %a, i32 0
%v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll
index 8f2da95353993..fd6c5081b5a35 100644
--- a/test/CodeGen/X86/mmx-cvt.ll
+++ b/test/CodeGen/X86/mmx-cvt.ll
@@ -347,7 +347,7 @@ define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movd %mm0, %rax
-; X64-NEXT: movd %rax, %xmm0
+; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%2 = bitcast <1 x i64>* %0 to x86_mmx*
diff --git a/test/CodeGen/X86/mod128.ll b/test/CodeGen/X86/mod128.ll
index 4fdee11ec83a1..ae28fab9bb629 100644
--- a/test/CodeGen/X86/mod128.ll
+++ b/test/CodeGen/X86/mod128.ll
@@ -18,7 +18,7 @@ define i64 @mod128(i128 %x) {
; WIN64-DAG: movq $0, 40(%rsp)
; WIN64-DAG: movq $3, 32(%rsp)
; WIN64: callq __modti3
- ; WIN64: movd %xmm0, %rax
+ ; WIN64: movq %xmm0, %rax
%1 = srem i128 %x, 3
%2 = trunc i128 %1 to i64
diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll
index 1caa22a15947e..e40f64eb39b21 100644
--- a/test/CodeGen/X86/movmsk.ll
+++ b/test/CodeGen/X86/movmsk.ll
@@ -100,7 +100,7 @@ entry:
define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: movd %xmm0, %rdi
+; CHECK-NEXT: movq %xmm0, %rdi
; CHECK-NEXT: shrq $63, %rdi
; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<kill>
; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index d1bb8d3e923b6..337e625df1683 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -596,14 +596,14 @@ define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movntiq %rax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i64:
; SSE4A: # BB#0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE4A-NEXT: movd %xmm0, %rax
+; SSE4A-NEXT: movq %xmm0, %rax
; SSE4A-NEXT: movntiq %rax, (%rdi)
; SSE4A-NEXT: retq
;
diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir
new file mode 100644
index 0000000000000..ba5c85922c7ab
--- /dev/null
+++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -0,0 +1,322 @@
+# RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=btver2 -run-pass=post-RA-sched -o - %s | FileCheck %s
+
+# Test that multiple DBG_VALUEs following an instruction whose register needs
+# to be changed during the post-RA scheduler pass are updated correctly.
+
+# Test case was derived from the output from the following command and
+# the source code below:
+#
+# clang -S -emit-llvm -target x86_64 -march=btver2 -O2 -g -o - <srcfile> |
+# llc -stop-before=post-RA-sched -o -
+#
+# Source code reduced from the original 8MB source file:
+#
+# struct a;
+# class b {
+# public:
+# a *c = ap;
+# unsigned *d() { return (unsigned *)c; }
+# a *ap;
+# };
+# enum { e = 2 };
+# template <typename f> f *g(f *h, f *i) {
+# long j = long(i), k = -!h;
+# return reinterpret_cast<f *>(long(h) | k & j);
+# }
+# class l {
+# public:
+# l(int);
+# int m;
+# };
+# unsigned *n;
+# unsigned o;
+# class p {
+# public:
+# int aa();
+# unsigned *q() {
+# n = r.d();
+# return g(n, &o);
+# }
+# b r;
+# };
+# class s : l {
+# public:
+# p t;
+# s(int h) : l(h), ab(t), ac(~0 << h) { ae(); }
+# p &ab;
+# int ac;
+# void ae() {
+# const unsigned *v;
+# const unsigned u = 0;
+# v = ab.q();
+# const unsigned *x = g(v, &u);
+# int w = x[m] & ac;
+# while (w) {
+# int z = (ab.aa() - 1) / e;
+# if (m <= z)
+# return;
+# }
+# }
+# };
+# class ad {
+# public:
+# ~ad() {
+# for (y();;)
+# ;
+# }
+# class y {
+# public:
+# y() : af(0) {}
+# s af;
+# };
+# };
+# class ag {
+# ad ah;
+# };
+# enum ai {};
+# class aj {
+# public:
+# aj(unsigned(ai));
+# ag ak;
+# };
+# struct al {
+# static unsigned am(ai);
+# };
+# template <int> struct an : al { static aj ao; };
+# template <> aj an<0>::ao(am);
+
+--- |
+
+ %class.s = type <{ %class.l, [4 x i8], %class.p, %class.p*, i32, [4 x i8] }>
+ %class.l = type { i32 }
+ %class.p = type { %class.b }
+ %class.b = type { %struct.a*, %struct.a* }
+ %struct.a = type opaque
+
+ @n = local_unnamed_addr global i32* null, align 8
+ @o = global i32 0, align 4
+
+ define linkonce_odr void @_ZN1sC2Ei(%class.s*, i32) unnamed_addr #0 align 2 !dbg !4 {
+ %3 = alloca i32, align 4
+ %4 = bitcast %class.s* %0 to %class.l*
+ tail call void @_ZN1lC2Ei(%class.l* %4, i32 %1)
+ %5 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2
+ tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !10, metadata !17), !dbg !18
+ tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !20, metadata !17), !dbg !27
+ %6 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2, i32 0, i32 1
+ %7 = bitcast %struct.a** %6 to i64*
+ %8 = load i64, i64* %7, align 8
+ %9 = bitcast %class.p* %5 to i64*
+ store i64 %8, i64* %9, align 8
+ %10 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 3
+ store %class.p* %5, %class.p** %10, align 8
+ %11 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 4
+ %12 = shl i32 -1, %1
+ store i32 %12, i32* %11, align 8
+ store i32 0, i32* %3, align 4
+ %13 = bitcast %class.p* %5 to i32**
+ %14 = load i32*, i32** %13, align 8
+ store i32* %14, i32** @n, align 8
+ %15 = icmp eq i32* %14, null
+ %16 = ptrtoint i32* %14 to i64
+ %17 = select i1 %15, i64 ptrtoint (i32* @o to i64), i64 0
+ %18 = or i64 %17, %16
+ tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !29, metadata !35), !dbg !36
+ tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !39, metadata !17), !dbg !44
+ %19 = ptrtoint i32* %3 to i64
+ call void @llvm.dbg.value(metadata i64 %19, i64 0, metadata !46, metadata !17), !dbg !48
+ %20 = icmp eq i64 %18, 0
+ %21 = select i1 %20, i64 %19, i64 0
+ %22 = or i64 %21, %18
+ %23 = inttoptr i64 %22 to i32*
+ %24 = bitcast %class.s* %0 to i32*
+ %25 = load i32, i32* %24, align 8
+ %26 = sext i32 %25 to i64
+ %27 = getelementptr inbounds i32, i32* %23, i64 %26
+ %28 = load i32, i32* %27, align 4
+ %29 = and i32 %12, %28
+ %30 = icmp eq i32 %29, 0
+ br i1 %30, label %47, label %31
+
+ ; <label>:31: ; preds = %2
+ %32 = bitcast %class.s* %0 to i32*
+ %33 = call i32 @_ZN1p2aaEv(%class.p* %5)
+ %34 = add nsw i32 %33, -1
+ %35 = sdiv i32 %34, 2
+ %36 = load i32, i32* %32, align 8
+ %37 = icmp sgt i32 %36, %35
+ br i1 %37, label %38, label %47
+
+ ; <label>:38: ; preds = %31
+ br label %39
+
+ ; <label>:39: ; preds = %39, %38
+ %40 = bitcast %class.s* %0 to i32*
+ %sunkaddr = ptrtoint %class.s* %0 to i64
+ %sunkaddr1 = add i64 %sunkaddr, 24
+ %sunkaddr2 = inttoptr i64 %sunkaddr1 to %class.p**
+ %41 = load %class.p*, %class.p** %sunkaddr2, align 8
+ %42 = call i32 @_ZN1p2aaEv(%class.p* %41)
+ %43 = add nsw i32 %42, -1
+ %44 = sdiv i32 %43, 2
+ %45 = load i32, i32* %40, align 8
+ %46 = icmp sgt i32 %45, %44
+ br i1 %46, label %39, label %47
+
+ ; <label>:47: ; preds = %39, %31, %2
+ ret void
+ }
+
+ declare void @_ZN1lC2Ei(%class.l*, i32) unnamed_addr #1
+
+ declare i32 @_ZN1p2aaEv(%class.p*) local_unnamed_addr #1
+
+ ; Function Attrs: nounwind readnone
+ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!2, !3}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !1 = !DIFile(filename: "test.cpp", directory: "")
+ !2 = !{i32 2, !"Dwarf Version", i32 4}
+ !3 = !{i32 2, !"Debug Info Version", i32 3}
+ !4 = distinct !DISubprogram(name: "s", linkageName: "_ZN1sC2Ei", scope: !5, file: !1, line: 32, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+ !5 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "s", file: !1, line: 29, size: 320, identifier: "_ZTS1s")
+ !6 = !DISubroutineType(types: !7)
+ !7 = !{null, !8, !9}
+ !8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+ !10 = !DILocalVariable(name: "this", arg: 1, scope: !11, type: !16, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !11 = distinct !DISubprogram(name: "p", linkageName: "_ZN1pC2Ev", scope: !12, file: !1, line: 20, type: !13, isLocal: false, isDefinition: true, scopeLine: 20, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: true, unit: !0)
+ !12 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "p", file: !1, line: 20, size: 128, identifier: "_ZTS1p")
+ !13 = !DISubroutineType(types: !14)
+ !14 = !{null, !15}
+ !15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+ !17 = !DIExpression()
+ !18 = !DILocation(line: 0, scope: !11, inlinedAt: !19)
+ !19 = distinct !DILocation(line: 32, column: 3, scope: !4)
+ !20 = !DILocalVariable(name: "this", arg: 1, scope: !21, type: !26, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !21 = distinct !DISubprogram(name: "b", linkageName: "_ZN1bC2Ev", scope: !22, file: !1, line: 2, type: !23, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: true, unit: !0)
+ !22 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "b", file: !1, line: 2, size: 128, identifier: "_ZTS1b")
+ !23 = !DISubroutineType(types: !24)
+ !24 = !{null, !25}
+ !25 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64)
+ !27 = !DILocation(line: 0, scope: !21, inlinedAt: !28)
+ !28 = distinct !DILocation(line: 20, column: 7, scope: !11, inlinedAt: !19)
+ !29 = !DILocalVariable(name: "u", scope: !30, file: !1, line: 37, type: !33)
+ !30 = distinct !DISubprogram(name: "ae", linkageName: "_ZN1s2aeEv", scope: !5, file: !1, line: 35, type: !31, isLocal: false, isDefinition: true, scopeLine: 35, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+ !31 = !DISubroutineType(types: !32)
+ !32 = !{null, !8}
+ !33 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !34)
+ !34 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+ !35 = !DIExpression(DW_OP_deref)
+ !36 = !DILocation(line: 37, column: 20, scope: !30, inlinedAt: !37)
+ !37 = distinct !DILocation(line: 32, column: 41, scope: !38)
+ !38 = distinct !DILexicalBlock(scope: !4, file: !1, line: 32, column: 39)
+ !39 = !DILocalVariable(name: "i", arg: 2, scope: !40, file: !1, line: 9, type: !43)
+ !40 = distinct !DISubprogram(name: "g<const unsigned int>", linkageName: "_Z1gIKjEPT_S2_S2_", scope: !1, file: !1, line: 9, type: !41, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+ !41 = !DISubroutineType(types: !42)
+ !42 = !{!43, !43, !43}
+ !43 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !33, size: 64)
+ !44 = !DILocation(line: 9, column: 37, scope: !40, inlinedAt: !45)
+ !45 = distinct !DILocation(line: 39, column: 25, scope: !30, inlinedAt: !37)
+ !46 = !DILocalVariable(name: "j", scope: !40, file: !1, line: 10, type: !47)
+ !47 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+ !48 = !DILocation(line: 10, column: 8, scope: !40, inlinedAt: !45)
+
+...
+---
+name: _ZN1sC2Ei
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+fixedStack:
+ - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '%rbx' }
+ - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%r14' }
+ - { id: 2, type: spill-slot, offset: -16, size: 8, alignment: 16 }
+stack:
+ - { id: 0, offset: -36, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ successors: %bb.3, %bb.2
+ liveins: %esi, %rdi, %r14, %rbx, %rbp
+
+ ; CHECK: [[REGISTER:%r[a-z0-9]+]] = LEA64r {{%r[a-z0-9]+}}, 1, _, -20, _
+ ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !46, !17, debug-location !48
+ ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !39, !17, debug-location !44
+
+ frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp
+ CFI_INSTRUCTION def_cfa_offset 16
+ CFI_INSTRUCTION offset %rbp, -16
+ %rbp = frame-setup MOV64rr %rsp
+ CFI_INSTRUCTION def_cfa_register %rbp
+ frame-setup PUSH64r killed %r14, implicit-def %rsp, implicit %rsp
+ frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp
+ %rsp = frame-setup SUB64ri8 %rsp, 16, implicit-def dead %eflags
+ CFI_INSTRUCTION offset %rbx, -32
+ CFI_INSTRUCTION offset %r14, -24
+ %r14d = MOV32rr %esi
+ %rbx = MOV64rr %rdi
+ CALL64pcrel32 @_ZN1lC2Ei, csr_64, implicit %rsp, implicit %rdi, implicit %esi, implicit-def %rsp
+ %rdi = LEA64r %rbx, 1, _, 8, _
+ DBG_VALUE debug-use %rdi, debug-use _, !20, !17, debug-location !27
+ DBG_VALUE debug-use %rdi, debug-use _, !10, !17, debug-location !18
+ %rax = MOV64rm %rbx, 1, _, 16, _ :: (load 8)
+ MOV64mr %rbx, 1, _, 8, _, killed %rax :: (store 8)
+ MOV64mr %rbx, 1, _, 24, _, %rdi :: (store 8)
+ %eax = MOV32ri -1
+ %cl = MOV8rr %r14b, implicit killed %r14d
+ %eax = SHL32rCL killed %eax, implicit-def dead %eflags, implicit %cl
+ MOV32mr %rbx, 1, _, 32, _, %eax :: (store 4, align 8)
+ MOV32mi %rbp, 1, _, -20, _, 0 :: (store 4)
+ %rcx = MOV64rm %rbx, 1, _, 8, _ :: (load 8)
+ MOV64mr %rip, 1, _, @n, _, %rcx :: (store 8)
+ %edx = XOR32rr undef %edx, undef %edx, implicit-def dead %eflags, implicit-def %rdx
+ TEST64rr %rcx, %rcx, implicit-def %eflags
+ %esi = MOV32ri @o, implicit-def %rsi
+ %rsi = CMOVNE64rr killed %rsi, %rdx, implicit killed %eflags
+ %rsi = OR64rr killed %rsi, killed %rcx, implicit-def %eflags
+ %rcx = LEA64r %rbp, 1, _, -20, _
+ DBG_VALUE debug-use %rcx, debug-use _, !46, !17, debug-location !48
+ DBG_VALUE debug-use %rcx, debug-use _, !39, !17, debug-location !44
+ DBG_VALUE %rbp, -20, !29, !17, debug-location !36
+ %rcx = CMOVNE64rr killed %rcx, killed %rdx, implicit killed %eflags
+ %rcx = OR64rr killed %rcx, killed %rsi, implicit-def dead %eflags
+ %rdx = MOVSX64rm32 %rbx, 1, _, 0, _ :: (load 4, align 8)
+ TEST32rm killed %eax, killed %rcx, 4, killed %rdx, 0, _, implicit-def %eflags :: (load 4)
+ JNE_1 %bb.2, implicit %eflags
+ JMP_1 %bb.3
+
+ bb.1:
+ successors: %bb.2
+ liveins: %rbx, %rbp
+
+ %rdi = MOV64rm %rbx, 1, _, 24, _ :: (load 8)
+
+ bb.2:
+ successors: %bb.1, %bb.3
+ liveins: %rbx, %rbp, %rsp, %rdi
+
+ CALL64pcrel32 @_ZN1p2aaEv, csr_64, implicit %rsp, implicit %rdi, implicit-def %rsp, implicit-def %eax
+ %eax = KILL %eax, implicit-def %rax
+ %ecx = LEA64_32r %rax, 1, _, -1, _, implicit-def %rcx
+ %ecx = SHR32ri %ecx, 31, implicit-def dead %eflags, implicit killed %rcx, implicit-def %rcx
+ %eax = LEA64_32r killed %rax, 1, killed %rcx, -1, _
+ %eax = SAR32r1 killed %eax, implicit-def dead %eflags
+ CMP32mr %rbx, 1, _, 0, _, killed %eax, implicit-def %eflags :: (load 4, align 8), (load 4, align 8)
+ JG_1 %bb.1, implicit killed %eflags
+
+ bb.3:
+ liveins: %rbp
+
+ %rsp = ADD64ri8 %rsp, 16, implicit-def dead %eflags
+ %rbx = POP64r implicit-def %rsp, implicit %rsp
+ %r14 = POP64r implicit-def %rsp, implicit %rsp
+ %rbp = POP64r implicit-def %rsp, implicit %rsp
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/pr14657.ll b/test/CodeGen/X86/pr14657.ll
new file mode 100644
index 0000000000000..cc7d3e068d4aa
--- /dev/null
+++ b/test/CodeGen/X86/pr14657.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; PR14657 - avoid truncation/extension of comparison results
+
+@da = common global [1024 x float] zeroinitializer, align 32
+@db = common global [1024 x float] zeroinitializer, align 32
+@dc = common global [1024 x float] zeroinitializer, align 32
+@dd = common global [1024 x float] zeroinitializer, align 32
+@dj = common global [1024 x i32] zeroinitializer, align 32
+
+define void @_Z9example25v() nounwind uwtable noinline ssp {
+; SSE2-LABEL: _Z9example25v:
+; SSE2: # BB#0: # %vector.ph
+; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB0_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movaps da+4096(%rax), %xmm1
+; SSE2-NEXT: movaps da+4112(%rax), %xmm2
+; SSE2-NEXT: cmpltps db+4112(%rax), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: cmpltps db+4096(%rax), %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: psllw $15, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movaps dc+4096(%rax), %xmm2
+; SSE2-NEXT: movaps dc+4112(%rax), %xmm3
+; SSE2-NEXT: cmpltps dd+4112(%rax), %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltps dd+4096(%rax), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax)
+; SSE2-NEXT: movdqa %xmm1, dj+4096(%rax)
+; SSE2-NEXT: addq $32, %rax
+; SSE2-NEXT: jne .LBB0_1
+; SSE2-NEXT: # BB#2: # %for.end
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: _Z9example25v:
+; SSE41: # BB#0: # %vector.ph
+; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
+; SSE41-NEXT: .p2align 4, 0x90
+; SSE41-NEXT: .LBB0_1: # %vector.body
+; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE41-NEXT: movaps da+4096(%rax), %xmm2
+; SSE41-NEXT: movaps da+4112(%rax), %xmm3
+; SSE41-NEXT: cmpltps db+4112(%rax), %xmm3
+; SSE41-NEXT: pshufb %xmm0, %xmm3
+; SSE41-NEXT: cmpltps db+4096(%rax), %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: psllw $15, %xmm2
+; SSE41-NEXT: psraw $15, %xmm2
+; SSE41-NEXT: movaps dc+4096(%rax), %xmm3
+; SSE41-NEXT: movaps dc+4112(%rax), %xmm4
+; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm4
+; SSE41-NEXT: pshufb %xmm0, %xmm4
+; SSE41-NEXT: cmpltps dd+4096(%rax), %xmm3
+; SSE41-NEXT: pshufb %xmm0, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE41-NEXT: psllw $15, %xmm3
+; SSE41-NEXT: psraw $15, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE41-NEXT: pand %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, dj+4112(%rax)
+; SSE41-NEXT: movdqa %xmm2, dj+4096(%rax)
+; SSE41-NEXT: addq $32, %rax
+; SSE41-NEXT: jne .LBB0_1
+; SSE41-NEXT: # BB#2: # %for.end
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: _Z9example25v:
+; AVX1: # BB#0: # %vector.ph
+; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB0_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vmovups da+4096(%rax), %ymm1
+; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1
+; AVX1-NEXT: vmovups dc+4096(%rax), %ymm2
+; AVX1-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1
+; AVX1-NEXT: vmovups %ymm1, dj+4096(%rax)
+; AVX1-NEXT: addq $32, %rax
+; AVX1-NEXT: jne .LBB0_1
+; AVX1-NEXT: # BB#2: # %for.end
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _Z9example25v:
+; AVX2: # BB#0: # %vector.ph
+; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB0_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovups da+4096(%rax), %ymm1
+; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1
+; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2
+; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vandps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovups %ymm1, dj+4096(%rax)
+; AVX2-NEXT: addq $32, %rax
+; AVX2-NEXT: jne .LBB0_1
+; AVX2-NEXT: # BB#2: # %for.end
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+vector.ph:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %index
+ %1 = bitcast float* %0 to <8 x float>*
+ %2 = load <8 x float>, <8 x float>* %1, align 16
+ %3 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %index
+ %4 = bitcast float* %3 to <8 x float>*
+ %5 = load <8 x float>, <8 x float>* %4, align 16
+ %6 = fcmp olt <8 x float> %2, %5
+ %7 = getelementptr inbounds [1024 x float], [1024 x float]* @dc, i64 0, i64 %index
+ %8 = bitcast float* %7 to <8 x float>*
+ %9 = load <8 x float>, <8 x float>* %8, align 16
+ %10 = getelementptr inbounds [1024 x float], [1024 x float]* @dd, i64 0, i64 %index
+ %11 = bitcast float* %10 to <8 x float>*
+ %12 = load <8 x float>, <8 x float>* %11, align 16
+ %13 = fcmp olt <8 x float> %9, %12
+ %14 = and <8 x i1> %6, %13
+ %15 = zext <8 x i1> %14 to <8 x i32>
+ %16 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %index
+ %17 = bitcast i32* %16 to <8 x i32>*
+ store <8 x i32> %15, <8 x i32>* %17, align 16
+ %index.next = add i64 %index, 8
+ %18 = icmp eq i64 %index.next, 1024
+ br i1 %18, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+define void @_Z9example24ss(i16 signext %x, i16 signext %y) nounwind uwtable noinline ssp {
+; SSE2-LABEL: _Z9example24ss:
+; SSE2: # BB#0: # %vector.ph
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB1_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movaps da+4096(%rax), %xmm2
+; SSE2-NEXT: movaps da+4112(%rax), %xmm3
+; SSE2-NEXT: cmpltps db+4112(%rax), %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltps db+4096(%rax), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax)
+; SSE2-NEXT: movdqa %xmm3, dj+4096(%rax)
+; SSE2-NEXT: addq $32, %rax
+; SSE2-NEXT: jne .LBB1_1
+; SSE2-NEXT: # BB#2: # %for.end
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: _Z9example24ss:
+; SSE41: # BB#0: # %vector.ph
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE41-NEXT: movd %esi, %xmm1
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: .p2align 4, 0x90
+; SSE41-NEXT: .LBB1_1: # %vector.body
+; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE41-NEXT: movaps da+4096(%rax), %xmm3
+; SSE41-NEXT: movaps da+4112(%rax), %xmm4
+; SSE41-NEXT: cmpltps db+4112(%rax), %xmm4
+; SSE41-NEXT: pshufb %xmm2, %xmm4
+; SSE41-NEXT: cmpltps db+4096(%rax), %xmm3
+; SSE41-NEXT: pshufb %xmm2, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
+; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
+; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax)
+; SSE41-NEXT: movdqa %xmm4, dj+4112(%rax)
+; SSE41-NEXT: addq $32, %rax
+; SSE41-NEXT: jne .LBB1_1
+; SSE41-NEXT: # BB#2: # %for.end
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: _Z9example24ss:
+; AVX1: # BB#0: # %vector.ph
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB1_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vmovups da+4096(%rax), %ymm2
+; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax)
+; AVX1-NEXT: addq $32, %rax
+; AVX1-NEXT: jne .LBB1_1
+; AVX1-NEXT: # BB#2: # %for.end
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _Z9example24ss:
+; AVX2: # BB#0: # %vector.ph
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %esi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB1_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovups da+4096(%rax), %ymm2
+; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT: vmovdqu %ymm2, dj+4096(%rax)
+; AVX2-NEXT: addq $32, %rax
+; AVX2-NEXT: jne .LBB1_1
+; AVX2-NEXT: # BB#2: # %for.end
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+vector.ph:
+ %0 = insertelement <8 x i16> undef, i16 %x, i32 0
+ %broadcast11 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %1 = insertelement <8 x i16> undef, i16 %y, i32 0
+ %broadcast12 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %index
+ %3 = bitcast float* %2 to <8 x float>*
+ %4 = load <8 x float>, <8 x float>* %3, align 16
+ %5 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %index
+ %6 = bitcast float* %5 to <8 x float>*
+ %7 = load <8 x float>, <8 x float>* %6, align 16
+ %8 = fcmp olt <8 x float> %4, %7
+ %9 = select <8 x i1> %8, <8 x i16> %broadcast11, <8 x i16> %broadcast12
+ %10 = sext <8 x i16> %9 to <8 x i32>
+ %11 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %index
+ %12 = bitcast i32* %11 to <8 x i32>*
+ store <8 x i32> %10, <8 x i32>* %12, align 16
+ %index.next = add i64 %index, 8
+ %13 = icmp eq i64 %index.next, 1024
+ br i1 %13, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ ret void
+}
diff --git a/test/CodeGen/X86/pr18344.ll b/test/CodeGen/X86/pr18344.ll
index 15bf91031ee88..fcf4174ec3d3b 100644
--- a/test/CodeGen/X86/pr18344.ll
+++ b/test/CodeGen/X86/pr18344.ll
@@ -36,7 +36,7 @@ define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noa
; X64: # BB#0: # %begin
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
-; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movslq %eax, %r8
; X64-NEXT: sarq $32, %rax
; X64-NEXT: pextrq $1, %xmm0, %rdx
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 54f9cb310dd37..84b7467e6a17f 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -16,7 +16,7 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: pextrq $1, %xmm0, %rdx
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movd %xmm0, %rax
+; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %r9
; CHECK-NEXT: shrq $32, %r9
; CHECK-NEXT: andl $2032, %eax # imm = 0x7F0
diff --git a/test/CodeGen/X86/pr22970.ll b/test/CodeGen/X86/pr22970.ll
new file mode 100644
index 0000000000000..38c063355f647
--- /dev/null
+++ b/test/CodeGen/X86/pr22970.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+define i32 @PR22970_i32(i32* nocapture readonly, i32) {
+; X86-LABEL: PR22970_i32:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $4095, %ecx # imm = 0xFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 32(%eax,%ecx,4), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: PR22970_i32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64-NEXT: andl $4095, %esi # imm = 0xFFF
+; X64-NEXT: movl 32(%rdi,%rsi,4), %eax
+; X64-NEXT: retq
+ %3 = and i32 %1, 4095
+ %4 = add nuw nsw i32 %3, 8
+ %5 = zext i32 %4 to i64
+ %6 = getelementptr inbounds i32, i32* %0, i64 %5
+ %7 = load i32, i32* %6, align 4
+ ret i32 %7
+}
+
+define i32 @PR22970_i64(i32* nocapture readonly, i64) {
+; X86-LABEL: PR22970_i64:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $4095, %ecx # imm = 0xFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 32(%eax,%ecx,4), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: PR22970_i64:
+; X64: # BB#0:
+; X64-NEXT: andl $4095, %esi # imm = 0xFFF
+; X64-NEXT: movl 32(%rdi,%rsi,4), %eax
+; X64-NEXT: retq
+ %3 = and i64 %1, 4095
+ %4 = add nuw nsw i64 %3, 8
+ %5 = getelementptr inbounds i32, i32* %0, i64 %4
+ %6 = load i32, i32* %5, align 4
+ ret i32 %6
+}
diff --git a/test/CodeGen/X86/pr30511.ll b/test/CodeGen/X86/pr30511.ll
index 053ae013b4515..3c512ba270091 100644
--- a/test/CodeGen/X86/pr30511.ll
+++ b/test/CodeGen/X86/pr30511.ll
@@ -11,7 +11,7 @@ define i64 @PR30511(<2 x double> %a) {
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movd %xmm0, %rax
+; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
%1 = fadd <2 x double> %a, <double 0x4338000000000000, double 0x4338000000000000>
%2 = bitcast <2 x double> %1 to <2 x i64>
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index d447bf9b9b8cb..178fe3357d433 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -55,7 +55,7 @@ define <16 x i8> @test5(<16 x i8> %V) {
; CHECK-LABEL: test5:
; CHECK: # BB#0:
; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movd %rax, %xmm1
+; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: movdqa %xmm1, (%rax)
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movdqa %xmm1, (%rax)
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 758aa462f5137..65c3ac0cc447f 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -33,7 +33,7 @@ define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
; CHECK: ## BB#0:
; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movd %rax, %xmm0
+; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
diff --git a/test/CodeGen/X86/sad_variations.ll b/test/CodeGen/X86/sad_variations.ll
index 1d826cf41a4d0..04fda5ed87740 100644
--- a/test/CodeGen/X86/sad_variations.ll
+++ b/test/CodeGen/X86/sad_variations.ll
@@ -206,7 +206,7 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
@@ -255,7 +255,7 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
@@ -304,7 +304,7 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll
index 2b19d02ba8b57..c99d3494b8ee3 100644
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -536,7 +536,7 @@ define double @u64_to_d(i64 %a) nounwind {
;
; SSE2_64-LABEL: u64_to_d:
; SSE2_64: # BB#0:
-; SSE2_64-NEXT: movd %rdi, %xmm1
+; SSE2_64-NEXT: movq %rdi, %xmm1
; SSE2_64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE2_64-NEXT: subpd {{.*}}(%rip), %xmm1
; SSE2_64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
diff --git a/test/CodeGen/X86/setcc-combine.ll b/test/CodeGen/X86/setcc-combine.ll
index c6ad5e0031edb..38205c660731f 100644
--- a/test/CodeGen/X86/setcc-combine.ll
+++ b/test/CodeGen/X86/setcc-combine.ll
@@ -1,166 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic < %s | FileCheck %s
define i32 @test_eq_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_eq_1:
-; CHECK: pcmpgtd %xmm0, %xmm1
-; CHECK-NEXT: pxor {{.*}}(%rip), %xmm1
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp eq <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_ne_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ne_1:
-; CHECK: pcmpgtd %xmm0, %xmm1
-; CHECK-NOT: pxor
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp ne <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_le_1:
-; CHECK: movl $-1, %eax
-; CHECK-NEXT: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sle <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_ge_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ge_1:
-; CHECK: pcmpgtd %xmm0, %xmm1
-; CHECK: pxor {{.*}}(%rip), %xmm1
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sge <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_lt_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_lt_1:
-; CHECK: pcmpgtd %xmm0, %xmm1
-; CHECK-NOT: pxor
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp slt <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_gt_1:
-; CHECK: xorl %eax, %eax
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sgt <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_eq_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_eq_2:
-; CHECK: pcmpgtd %xmm1, %xmm0
-; CHECK-NEXT: pxor {{.*}}(%rip), %xmm0
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp eq <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_ne_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ne_2:
-; CHECK: pcmpgtd %xmm1, %xmm0
-; CHECK-NOT: pxor
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp ne <4 x i32> %sext, zeroinitializer
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_le_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_le_2:
-; CHECK: pcmpgtd %xmm1, %xmm0
-; CHECK: pxor {{.*}}(%rip), %xmm0
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sle <4 x i32> zeroinitializer, %sext
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ge_2:
-; CHECK: movl $-1, %eax
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sge <4 x i32> zeroinitializer, %sext
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_lt_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_lt_2:
-; CHECK: pcmpgtd %xmm1, %xmm0
-; CHECK-NOT: pxor
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp slt <4 x i32> zeroinitializer, %sext
- %0 = extractelement <4 x i1> %cmp, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
define i32 @test_gt_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_gt_2:
-; CHECK: pcmpgtd %xmm1, %xmm0
-; CHECK-NOT: pxor
-; CHECK: retq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
%sext = sext <4 x i1> %cmp to <4 x i32>
%cmp1 = icmp sgt <4 x i32> zeroinitializer, %sext
- %0 = extractelement <4 x i1> %cmp1, i32 1
- %1 = sext i1 %0 to i32
- ret i32 %1
+ %t0 = extractelement <4 x i1> %cmp1, i32 1
+ %t1 = sext i1 %t0 to i32
+ ret i32 %t1
}
+
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index b4ec03598aa4e..2996edaec3e0e 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -58,17 +58,17 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, %r9
-; SSE2-NEXT: movd %xmm0, %r10
-; SSE2-NEXT: movd %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %r9
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rdi
+; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm2, %rcx
-; SSE2-NEXT: movd %xmm3, %rdx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movq %xmm3, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
; SSE2-NEXT: xorq %r10, %rcx
; SSE2-NEXT: orq %rdx, %rcx
@@ -100,17 +100,17 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, %r9
-; SSE2-NEXT: movd %xmm0, %r10
-; SSE2-NEXT: movd %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %r9
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rdi
+; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm2, %rcx
-; SSE2-NEXT: movd %xmm3, %rdx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movq %xmm3, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
; SSE2-NEXT: xorq %r10, %rcx
; SSE2-NEXT: orq %rdx, %rcx
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 930af226b9535..d5cd8b0525dd5 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -801,7 +801,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
-; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
@@ -839,7 +839,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
-; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
index dfd9c0b0b3029..54de15c292f60 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -16,7 +16,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
; X64-LABEL: test_mm_cvtsi128_si64:
; X64: # BB#0:
-; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
%res = extractelement <2 x i64> %a0, i32 0
ret i64 %res
@@ -35,7 +35,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
; X64-LABEL: test_mm_cvtsi64_si128:
; X64: # BB#0:
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
%res1 = insertelement <2 x i64> %res0, i64 0, i32 1
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 3071155172e35..964037ea80af8 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2291,8 +2291,8 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
;
; X64-LABEL: test_mm_set_epi64x:
; X64: # BB#0:
-; X64-NEXT: movd %rdi, %xmm1
-; X64-NEXT: movd %rsi, %xmm0
+; X64-NEXT: movq %rdi, %xmm1
+; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <2 x i64> undef, i64 %a1, i32 0
@@ -2433,7 +2433,7 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
;
; X64-LABEL: test_mm_set1_epi64x:
; X64: # BB#0:
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
@@ -2685,8 +2685,8 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
;
; X64-LABEL: test_mm_setr_epi64x:
; X64: # BB#0:
-; X64-NEXT: movd %rsi, %xmm1
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rsi, %xmm1
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
@@ -3249,7 +3249,7 @@ define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
;
; X64-LABEL: test_mm_storel_epi64:
; X64: # BB#0:
-; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
%ext = extractelement <2 x i64> %a1, i32 0
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 33a4f413b6832..14c155c8c6c09 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -1808,32 +1808,32 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_movd_64:
; GENERIC: # BB#0:
-; GENERIC-NEXT: movd %rdi, %xmm1
+; GENERIC-NEXT: movq %rdi, %xmm1
; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; GENERIC-NEXT: paddq %xmm0, %xmm1
; GENERIC-NEXT: paddq %xmm0, %xmm2
-; GENERIC-NEXT: movd %xmm2, %rax
+; GENERIC-NEXT: movq %xmm2, %rax
; GENERIC-NEXT: movq %xmm1, (%rsi)
; GENERIC-NEXT: retq
;
; ATOM-LABEL: test_movd_64:
; ATOM: # BB#0:
; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; ATOM-NEXT: movd %rdi, %xmm2
+; ATOM-NEXT: movq %rdi, %xmm2
; ATOM-NEXT: paddq %xmm0, %xmm2
; ATOM-NEXT: paddq %xmm0, %xmm1
; ATOM-NEXT: movq %xmm2, (%rsi)
-; ATOM-NEXT: movd %xmm1, %rax
+; ATOM-NEXT: movq %xmm1, %rax
; ATOM-NEXT: retq
;
; SLM-LABEL: test_movd_64:
; SLM: # BB#0:
; SLM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [3:1.00]
-; SLM-NEXT: movd %rdi, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movq %rdi, %xmm1 # sched: [1:0.50]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: movq %xmm1, (%rsi) # sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm2 # sched: [1:0.50]
-; SLM-NEXT: movd %xmm2, %rax # sched: [1:0.50]
+; SLM-NEXT: movq %xmm2, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movd_64:
@@ -3545,6 +3545,52 @@ define i16 @test_pextrw(<8 x i16> %a0) {
ret i16 %1
}
+define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
+; GENERIC-LABEL: test_pinsrw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pinsrw $1, %edi, %xmm0
+; GENERIC-NEXT: pinsrw $3, (%rsi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pinsrw:
+; ATOM: # BB#0:
+; ATOM-NEXT: pinsrw $1, %edi, %xmm0
+; ATOM-NEXT: pinsrw $3, (%rsi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pinsrw:
+; SLM: # BB#0:
+; SLM-NEXT: pinsrw $1, %edi, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pinsrw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pinsrw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pinsrw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+ %2 = load i16, i16 *%a2
+ %3 = insertelement <8 x i16> %1, i16 %2, i32 3
+ ret <8 x i16> %3
+}
+
define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmaddwd:
; GENERIC: # BB#0:
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
new file mode 100644
index 0000000000000..482b2fcab6425
--- /dev/null
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -0,0 +1,455 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+
+define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_addsubpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: addsubpd %xmm1, %xmm0
+; GENERIC-NEXT: addsubpd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_addsubpd:
+; ATOM: # BB#0:
+; ATOM-NEXT: addsubpd %xmm1, %xmm0
+; ATOM-NEXT: addsubpd (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_addsubpd:
+; SLM: # BB#0:
+; SLM-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: addsubpd (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_addsubpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addsubpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addsubpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_addsubps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: addsubps %xmm1, %xmm0
+; GENERIC-NEXT: addsubps (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_addsubps:
+; ATOM: # BB#0:
+; ATOM-NEXT: addsubps %xmm1, %xmm0
+; ATOM-NEXT: addsubps (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_addsubps:
+; SLM: # BB#0:
+; SLM-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: addsubps (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_addsubps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_addsubps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_addsubps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_haddpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: haddpd %xmm1, %xmm0
+; GENERIC-NEXT: haddpd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_haddpd:
+; ATOM: # BB#0:
+; ATOM-NEXT: haddpd %xmm1, %xmm0
+; ATOM-NEXT: haddpd (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_haddpd:
+; SLM: # BB#0:
+; SLM-NEXT: haddpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: haddpd (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_haddpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_haddpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_haddpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_haddps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: haddps %xmm1, %xmm0
+; GENERIC-NEXT: haddps (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_haddps:
+; ATOM: # BB#0:
+; ATOM-NEXT: haddps %xmm1, %xmm0
+; ATOM-NEXT: haddps (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_haddps:
+; SLM: # BB#0:
+; SLM-NEXT: haddps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: haddps (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_haddps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_haddps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_haddps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_hsubpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: hsubpd %xmm1, %xmm0
+; GENERIC-NEXT: hsubpd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_hsubpd:
+; ATOM: # BB#0:
+; ATOM-NEXT: hsubpd %xmm1, %xmm0
+; ATOM-NEXT: hsubpd (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_hsubpd:
+; SLM: # BB#0:
+; SLM-NEXT: hsubpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: hsubpd (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_hsubpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_hsubpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_hsubpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_hsubps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: hsubps %xmm1, %xmm0
+; GENERIC-NEXT: hsubps (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_hsubps:
+; ATOM: # BB#0:
+; ATOM-NEXT: hsubps %xmm1, %xmm0
+; ATOM-NEXT: hsubps (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_hsubps:
+; SLM: # BB#0:
+; SLM-NEXT: hsubps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: hsubps (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_hsubps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_hsubps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_hsubps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <16 x i8> @test_lddqu(i8* %a0) {
+; GENERIC-LABEL: test_lddqu:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: lddqu (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_lddqu:
+; ATOM: # BB#0:
+; ATOM-NEXT: lddqu (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_lddqu:
+; SLM: # BB#0:
+; SLM-NEXT: lddqu (%rdi), %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lddqu:
+; SANDY: # BB#0:
+; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_lddqu:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_lddqu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0)
+ ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
+
+define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
+; GENERIC-LABEL: test_movddup:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; GENERIC-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; GENERIC-NEXT: addpd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_movddup:
+; ATOM: # BB#0:
+; ATOM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0]
+; ATOM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; ATOM-NEXT: addpd %xmm0, %xmm1
+; ATOM-NEXT: movapd %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_movddup:
+; SLM: # BB#0:
+; SLM-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
+; SLM-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] sched: [3:1.00]
+; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movddup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movddup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movddup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:1.00]
+; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = load <2 x double>, <2 x double> *%a1, align 16
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %4 = fadd <2 x double> %1, %3
+ ret <2 x double> %4
+}
+
+define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
+; GENERIC-LABEL: test_movshdup:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; GENERIC-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
+; GENERIC-NEXT: addps %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_movshdup:
+; ATOM: # BB#0:
+; ATOM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
+; ATOM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; ATOM-NEXT: addps %xmm0, %xmm1
+; ATOM-NEXT: movaps %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_movshdup:
+; SLM: # BB#0:
+; SLM-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
+; SLM-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [3:1.00]
+; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movshdup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movshdup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movshdup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [5:1.00]
+; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %2 = load <4 x float>, <4 x float> *%a1, align 16
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %4 = fadd <4 x float> %1, %3
+ ret <4 x float> %4
+}
+
+define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
+; GENERIC-LABEL: test_movsldup:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; GENERIC-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2]
+; GENERIC-NEXT: addps %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_movsldup:
+; ATOM: # BB#0:
+; ATOM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2]
+; ATOM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; ATOM-NEXT: addps %xmm0, %xmm1
+; ATOM-NEXT: movaps %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_movsldup:
+; SLM: # BB#0:
+; SLM-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
+; SLM-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [3:1.00]
+; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movsldup:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movsldup:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movsldup:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [5:1.00]
+; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50]
+; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %2 = load <4 x float>, <4 x float> *%a1, align 16
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %4 = fadd <4 x float> %1, %3
+ ret <4 x float> %4
+}
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
new file mode 100644
index 0000000000000..340b9abe88797
--- /dev/null
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -0,0 +1,1938 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+
+define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_blendpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; GENERIC-NEXT: addpd %xmm1, %xmm0
+; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_blendpd:
+; SLM: # BB#0:
+; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
+; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_blendpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
+; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = fadd <2 x double> %a1, %1
+ %4 = shufflevector <2 x double> %3, <2 x double> %2, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %4
+}
+
+define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_blendps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_blendps:
+; SLM: # BB#0:
+; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
+; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_blendps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
+; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %3
+}
+
+define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_blendvpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movapd %xmm0, %xmm3
+; GENERIC-NEXT: movaps %xmm2, %xmm0
+; GENERIC-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; GENERIC-NEXT: blendvpd %xmm0, (%rdi), %xmm3
+; GENERIC-NEXT: movapd %xmm3, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_blendvpd:
+; SLM: # BB#0:
+; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
+; SLM-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
+; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_blendvpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendvpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendvpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ %2 = load <2 x double>, <2 x double> *%a3, align 16
+ %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_blendvps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movaps %xmm0, %xmm3
+; GENERIC-NEXT: movaps %xmm2, %xmm0
+; GENERIC-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; GENERIC-NEXT: blendvps %xmm0, (%rdi), %xmm3
+; GENERIC-NEXT: movaps %xmm3, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_blendvps:
+; SLM: # BB#0:
+; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
+; SLM-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
+; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_blendvps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_blendvps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_blendvps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; register-register form
+ %2 = load <4 x float>, <4 x float> *%a3, align 16 ; explicit alignment, as in test_blendvpd, so the load can fold into the second blendvps
+ %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) ; memory-operand form, same mask %a2
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_dppd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: dppd $7, %xmm1, %xmm0
+; GENERIC-NEXT: dppd $7, (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_dppd:
+; SLM: # BB#0:
+; SLM-NEXT: dppd $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: dppd $7, (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dppd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_dppd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_dppd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+ %2 = load <2 x double>, <2 x double> *%a2, align 16
+ %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7)
+ ret <2 x double> %3
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_dpps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: dpps $7, %xmm1, %xmm0
+; GENERIC-NEXT: dpps $7, (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_dpps:
+; SLM: # BB#0:
+; SLM-NEXT: dpps $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: dpps $7, (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dpps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_dpps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_dpps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+ %2 = load <4 x float>, <4 x float> *%a2, align 16
+ %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7)
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) {
+; GENERIC-LABEL: test_insertps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; GENERIC-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_insertps:
+; SLM: # BB#0:
+; SLM-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; SLM-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_insertps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_insertps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_insertps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
+; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
+ %2 = load float, float *%a2
+ %3 = insertelement <4 x float> %1, float %2, i32 3
+ ret <4 x float> %3
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x i64> @test_movntdqa(i8* %a0) {
+; GENERIC-LABEL: test_movntdqa:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntdqa (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_movntdqa:
+; SLM: # BB#0:
+; SLM-NEXT: movntdqa (%rdi), %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movntdqa:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_movntdqa:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_movntdqa:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readonly ; loads through its pointer argument, so readonly rather than readnone
+
+define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_mpsadbw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: mpsadbw $7, %xmm1, %xmm0
+; GENERIC-NEXT: mpsadbw $7, (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_mpsadbw:
+; SLM: # BB#0:
+; SLM-NEXT: mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00]
+; SLM-NEXT: mpsadbw $7, (%rdi), %xmm0 # sched: [10:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_mpsadbw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
+; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_mpsadbw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_mpsadbw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %4 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %2, <16 x i8> %3, i8 7)
+ ret <8 x i16> %4
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_packusdw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: packusdw %xmm1, %xmm0
+; GENERIC-NEXT: packusdw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_packusdw:
+; SLM: # BB#0:
+; SLM-NEXT: packusdw %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: packusdw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_packusdw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_packusdw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_packusdw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = bitcast <8 x i16> %1 to <4 x i32>
+ %3 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %4 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %2, <4 x i32> %3)
+ ret <8 x i16> %4
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> *%a3) {
+; GENERIC-LABEL: test_pblendvb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movdqa %xmm0, %xmm3
+; GENERIC-NEXT: movaps %xmm2, %xmm0
+; GENERIC-NEXT: pblendvb %xmm0, %xmm1, %xmm3
+; GENERIC-NEXT: pblendvb %xmm0, (%rdi), %xmm3
+; GENERIC-NEXT: movdqa %xmm3, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pblendvb:
+; SLM: # BB#0:
+; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
+; SLM-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
+; SLM-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pblendvb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pblendvb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pblendvb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
+ %2 = load <16 x i8>, <16 x i8> *%a3, align 16
+ %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2)
+ ret <16 x i8> %3
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_pblendw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7]
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pblendw:
+; SLM: # BB#0:
+; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
+; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pblendw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pblendw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
+; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pblendw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+ ret <8 x i16> %3
+}
+
+define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_pcmpeqq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pcmpeqq %xmm1, %xmm0
+; GENERIC-NEXT: pcmpeqq (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpeqq:
+; SLM: # BB#0:
+; SLM-NEXT: pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpeqq (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpeqq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpeqq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = icmp eq <2 x i64> %a0, %a1
+ %2 = sext <2 x i1> %1 to <2 x i64>
+ %3 = load <2 x i64>, <2 x i64>*%a2, align 16
+ %4 = icmp eq <2 x i64> %2, %3
+ %5 = sext <2 x i1> %4 to <2 x i64>
+ ret <2 x i64> %5
+}
+
+define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
+; GENERIC-LABEL: test_pextrb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pextrb $3, %xmm0, %eax
+; GENERIC-NEXT: pextrb $1, %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pextrb:
+; SLM: # BB#0:
+; SLM-NEXT: pextrb $3, %xmm0, %eax # sched: [1:1.00]
+; SLM-NEXT: pextrb $1, %xmm0, (%rdi) # sched: [4:2.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pextrb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pextrb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pextrb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = extractelement <16 x i8> %a0, i32 3
+ %2 = extractelement <16 x i8> %a0, i32 1
+ store i8 %2, i8 *%a1
+ %3 = zext i8 %1 to i32
+ ret i32 %3
+}
+
+define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
+; GENERIC-LABEL: test_pextrd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pextrd $3, %xmm0, %eax
+; GENERIC-NEXT: pextrd $1, %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pextrd:
+; SLM: # BB#0:
+; SLM-NEXT: pextrd $3, %xmm0, %eax # sched: [1:1.00]
+; SLM-NEXT: pextrd $1, %xmm0, (%rdi) # sched: [4:2.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pextrd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pextrd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pextrd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = extractelement <4 x i32> %a0, i32 3
+ %2 = extractelement <4 x i32> %a0, i32 1
+ store i32 %2, i32 *%a1
+ ret i32 %1
+}
+
+define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { ; NOTE(review): %a1 is never referenced in the body
+; GENERIC-LABEL: test_pextrq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pextrq $1, %xmm0, %rax
+; GENERIC-NEXT: pextrq $1, %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pextrq:
+; SLM: # BB#0:
+; SLM-NEXT: pextrq $1, %xmm0, %rax # sched: [1:1.00]
+; SLM-NEXT: pextrq $1, %xmm0, (%rdi) # sched: [4:2.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pextrq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pextrq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pextrq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = extractelement <2 x i64> %a0, i32 1 ; lane 1, returned in a register
+ %2 = extractelement <2 x i64> %a0, i32 1 ; same lane again, this copy feeds the store (presumably lane 0 would not select pextrq - confirm)
+ store i64 %2, i64 *%a2 ; exercises the extract-to-memory form
+ ret i64 %1
+}
+
+define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
+; GENERIC-LABEL: test_pextrw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pextrw $3, %xmm0, %eax
+; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pextrw:
+; SLM: # BB#0:
+; SLM-NEXT: pextrw $3, %xmm0, %eax # sched: [4:1.00]
+; SLM-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [4:2.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pextrw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pextrw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pextrw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = extractelement <8 x i16> %a0, i32 3
+ %2 = extractelement <8 x i16> %a0, i32 1
+ store i16 %2, i16 *%a1
+ %3 = zext i16 %1 to i32
+ ret i32 %3
+}
+
+; test_phminposuw: instruction-scheduling test for SSE4.1 PHMINPOSUW.
+; The intrinsic is applied first to a vector loaded from %a0 and then to its
+; own result, so the expected output contains one memory-operand form and one
+; register-operand form. Each check-prefix group below pins the exact
+; instruction sequence; every group except the GENERIC one also pins a
+; trailing "# sched: [x:y]" comment.
+; NOTE(review): x:y is presumably latency:reciprocal-throughput taken from the
+; selected CPU's scheduling model - confirm against the RUN lines, which are
+; outside this hunk.
+define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
+; GENERIC-LABEL: test_phminposuw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phminposuw (%rdi), %xmm0
+; GENERIC-NEXT: phminposuw %xmm0, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_phminposuw:
+; SLM: # BB#0:
+; SLM-NEXT: phminposuw (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: phminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phminposuw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phminposuw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phminposuw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; 16-byte-aligned load; the check lines expect it folded into the first
+ ; phminposuw as a (%rdi) memory operand.
+ %1 = load <8 x i16>, <8 x i16> *%a0, align 16
+ %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1)
+ ; Second call consumes the first result, giving the register-operand form.
+ %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2)
+ ret <8 x i16> %3
+}
+; Declaration of the intrinsic exercised by test_phminposuw.
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+; test_pinsrb: instruction-scheduling test for SSE4.1 PINSRB.
+; Inserts the scalar argument %a1 into lane 1 of %a0 and a byte loaded from
+; %a2 into lane 3, so the expected output has one register-source insert and
+; one memory-source insert. The check-prefix groups below pin the instruction
+; sequence and, where present, the "# sched" comment for each configuration.
+; NOTE(review): the "# sched: [x:y]" pair is presumably
+; latency:reciprocal-throughput - confirm against the RUN lines (not visible
+; in this hunk).
+define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
+; GENERIC-LABEL: test_pinsrb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pinsrb $1, %edi, %xmm0
+; GENERIC-NEXT: pinsrb $3, (%rsi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pinsrb:
+; SLM: # BB#0:
+; SLM-NEXT: pinsrb $1, %edi, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pinsrb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pinsrb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pinsrb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-source insert into lane 1.
+ %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+ ; Loaded byte goes into lane 3; the check lines expect the load folded
+ ; into the insert as a (%rsi) memory operand.
+ %2 = load i8, i8 *%a2
+ %3 = insertelement <16 x i8> %1, i8 %2, i32 3
+ ret <16 x i8> %3
+}
+
+; test_pinsrd: instruction-scheduling test for SSE4.1 PINSRD.
+; Inserts the i32 argument %a1 into lane 1 of %a0 and an i32 loaded from %a2
+; into lane 3, covering the register-source and memory-source forms. The
+; check-prefix groups below pin the expected instructions and, where present,
+; their "# sched" comments.
+define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_pinsrd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pinsrd $1, %edi, %xmm0
+; GENERIC-NEXT: pinsrd $3, (%rsi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pinsrd:
+; SLM: # BB#0:
+; SLM-NEXT: pinsrd $1, %edi, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pinsrd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pinsrd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pinsrd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-source insert into lane 1.
+ %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+ ; Memory-source insert into lane 3 (load expected to fold into the insert).
+ %2 = load i32, i32 *%a2
+ %3 = insertelement <4 x i32> %1, i32 %2, i32 3
+ ret <4 x i32> %3
+}
+
+; test_pinsrq: instruction-scheduling test for SSE4.1 PINSRQ.
+; Unlike the byte/dword insert tests, the two inserts here target different
+; vectors: %a2 goes into lane 1 of %a0 and an i64 loaded from %a3 goes into
+; lane 1 of %a1. The two results are then added, so the expected output also
+; contains a paddq/vpaddq. Because the two inserts are independent, the
+; expected order differs per configuration: the SLM and BTVER2 groups list the
+; memory-source insert first, the other groups list the register-source
+; insert first.
+define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
+; GENERIC-LABEL: test_pinsrq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pinsrq $1, %rdi, %xmm0
+; GENERIC-NEXT: pinsrq $1, (%rsi), %xmm1
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pinsrq:
+; SLM: # BB#0:
+; SLM-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pinsrq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pinsrq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pinsrq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-source insert into %a0.
+ %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1
+ ; Memory-source insert into %a1 (load expected to fold into the insert).
+ %2 = load i64, i64 *%a3
+ %3 = insertelement <2 x i64> %a1, i64 %2, i32 1
+ ; The returned sum depends on both inserted vectors.
+ %4 = add <2 x i64> %1, %3
+ ret <2 x i64> %4
+}
+
+; test_pmaxsb: instruction-scheduling test for SSE4.1 PMAXSB.
+; Calls the intrinsic on the two register arguments, then on that result and a
+; vector loaded from %a2, so the expected output has a register-register form
+; followed by a register-memory form. The check-prefix groups below pin the
+; instruction sequence and, where present, the "# sched" comment.
+; NOTE(review): "# sched: [x:y]" is presumably latency:reciprocal-throughput
+; - confirm against the RUN lines (not visible in this hunk).
+define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pmaxsb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmaxsb %xmm1, %xmm0
+; GENERIC-NEXT: pmaxsb (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmaxsb:
+; SLM: # BB#0:
+; SLM-NEXT: pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pmaxsb (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxsb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmaxsb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmaxsb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+ ; 16-byte-aligned load, expected to fold into the second pmaxsb.
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2)
+ ret <16 x i8> %3
+}
+; Declaration of the intrinsic exercised by test_pmaxsb.
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+; test_pmaxsd: instruction-scheduling test for SSE4.1 PMAXSD.
+; The intrinsic is applied to the two register arguments and then to that
+; result and a vector loaded from %a2, giving one register-register and one
+; register-memory instruction in the expected output. The check-prefix groups
+; below pin the instructions and, where present, their "# sched" comments.
+define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pmaxsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmaxsd %xmm1, %xmm0
+; GENERIC-NEXT: pmaxsd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmaxsd:
+; SLM: # BB#0:
+; SLM-NEXT: pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pmaxsd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxsd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmaxsd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmaxsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+ ; Aligned load, expected to fold into the second pmaxsd as (%rdi).
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+; Declaration of the intrinsic exercised by test_pmaxsd.
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+; test_pmaxud: instruction-scheduling test for SSE4.1 PMAXUD.
+; Same shape as the other packed max tests: one intrinsic call on the register
+; arguments, one on that result and a vector loaded from %a2. The
+; check-prefix groups below pin the resulting register-register and
+; register-memory instructions and, where present, their "# sched" comments.
+define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pmaxud:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmaxud %xmm1, %xmm0
+; GENERIC-NEXT: pmaxud (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmaxud:
+; SLM: # BB#0:
+; SLM-NEXT: pmaxud %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pmaxud (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxud:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmaxud:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmaxud:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+ ; Aligned load, expected to fold into the second pmaxud as (%rdi).
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+; Declaration of the intrinsic exercised by test_pmaxud.
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+; test_pmaxuw: instruction-scheduling test for SSE4.1 PMAXUW.
+; One intrinsic call on the two register arguments, then one on that result
+; and a vector loaded from %a2. The check-prefix groups below pin the expected
+; register-register and register-memory instructions and, where present,
+; their "# sched" comments.
+define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_pmaxuw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmaxuw %xmm1, %xmm0
+; GENERIC-NEXT: pmaxuw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmaxuw:
+; SLM: # BB#0:
+; SLM-NEXT: pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pmaxuw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxuw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmaxuw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmaxuw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+ ; Aligned load, expected to fold into the second pmaxuw as (%rdi).
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+; Declaration of the intrinsic exercised by test_pmaxuw.
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+; test_pminsb: instruction-scheduling test for SSE4.1 PMINSB.
+; Calls the intrinsic on the two register arguments, then on that result and a
+; vector loaded from %a2, so the expected output has a register-register form
+; followed by a register-memory form. The check-prefix groups below pin the
+; instruction sequence and, where present, the "# sched" comment.
+; NOTE(review): "# sched: [x:y]" is presumably latency:reciprocal-throughput
+; - confirm against the RUN lines (not visible in this hunk).
+define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pminsb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pminsb %xmm1, %xmm0
+; GENERIC-NEXT: pminsb (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pminsb:
+; SLM: # BB#0:
+; SLM-NEXT: pminsb %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pminsb (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminsb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pminsb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pminsb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+ ; 16-byte-aligned load, expected to fold into the second pminsb.
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2)
+ ret <16 x i8> %3
+}
+; Declaration of the intrinsic exercised by test_pminsb.
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+; test_pminsd: instruction-scheduling test for SSE4.1 PMINSD.
+; The intrinsic is applied to the two register arguments and then to that
+; result and a vector loaded from %a2, giving one register-register and one
+; register-memory instruction in the expected output. The check-prefix groups
+; below pin the instructions and, where present, their "# sched" comments.
+define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pminsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pminsd %xmm1, %xmm0
+; GENERIC-NEXT: pminsd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pminsd:
+; SLM: # BB#0:
+; SLM-NEXT: pminsd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pminsd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminsd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pminsd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pminsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+ ; Aligned load, expected to fold into the second pminsd as (%rdi).
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+; Declaration of the intrinsic exercised by test_pminsd.
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+; test_pminud: instruction-scheduling test for SSE4.1 PMINUD.
+; Same shape as the other packed min tests: one intrinsic call on the register
+; arguments, one on that result and a vector loaded from %a2. The
+; check-prefix groups below pin the resulting register-register and
+; register-memory instructions and, where present, their "# sched" comments.
+define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pminud:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pminud %xmm1, %xmm0
+; GENERIC-NEXT: pminud (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pminud:
+; SLM: # BB#0:
+; SLM-NEXT: pminud %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pminud (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminud:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pminud:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pminud:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+ ; Aligned load, expected to fold into the second pminud as (%rdi).
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+; Declaration of the intrinsic exercised by test_pminud.
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+; test_pminuw: instruction-scheduling test for SSE4.1 PMINUW.
+; One intrinsic call on the two register arguments, then one on that result
+; and a vector loaded from %a2. The check-prefix groups below pin the expected
+; register-register and register-memory instructions and, where present,
+; their "# sched" comments.
+define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_pminuw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pminuw %xmm1, %xmm0
+; GENERIC-NEXT: pminuw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pminuw:
+; SLM: # BB#0:
+; SLM-NEXT: pminuw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pminuw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminuw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pminuw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pminuw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Register-register form.
+ %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+ ; Aligned load, expected to fold into the second pminuw as (%rdi).
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+; Declaration of the intrinsic exercised by test_pminuw.
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+; test_pmovsxbw: instruction-scheduling test for SSE4.1 PMOVSXBW.
+; Written with generic IR rather than an intrinsic: elements 0-7 of %a0 are
+; sign-extended from i8 to i16 (register-source form), an <8 x i8> loaded from
+; %a1 is sign-extended the same way (memory-source form), and the two results
+; are added. The check-prefix groups below pin the expected instructions; note
+; that the SLM group orders the memory-source extension first and ends with an
+; extra movdqa to move the sum into %xmm0.
+; NOTE(review): "# sched: [x:y]" is presumably latency:reciprocal-throughput
+; - confirm against the RUN lines (not visible in this hunk).
+define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxbw %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxbw (%rdi), %xmm0
+; GENERIC-NEXT: paddw %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxbw:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxbw (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxbw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxbw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-7 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <8 x i8>, <8 x i8>* %a1, align 1
+ %4 = sext <8 x i8> %3 to <8 x i16>
+ ; The returned sum depends on both extensions.
+ %5 = add <8 x i16> %2, %4
+ ret <8 x i16> %5
+}
+
+; test_pmovsxbd: instruction-scheduling test for SSE4.1 PMOVSXBD.
+; Elements 0-3 of %a0 are sign-extended from i8 to i32 (register-source form),
+; a <4 x i8> loaded from %a1 is sign-extended the same way (memory-source
+; form), and the two results are added. The check-prefix groups below pin the
+; expected instructions and, where present, their "# sched" comments.
+define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxbd %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxbd (%rdi), %xmm0
+; GENERIC-NEXT: paddd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxbd:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxbd (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxbd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxbd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-3 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <4 x i8>, <4 x i8>* %a1, align 1
+ %4 = sext <4 x i8> %3 to <4 x i32>
+ ; The returned sum depends on both extensions.
+ %5 = add <4 x i32> %2, %4
+ ret <4 x i32> %5
+}
+
+; test_pmovsxbq: instruction-scheduling test for SSE4.1 PMOVSXBQ.
+; Elements 0-1 of %a0 are sign-extended from i8 to i64 (register-source form),
+; a <2 x i8> loaded from %a1 is sign-extended the same way (memory-source
+; form), and the two results are added. The check-prefix groups below pin the
+; expected instructions and, where present, their "# sched" comments.
+define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxbq %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxbq (%rdi), %xmm0
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxbq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxbq (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxbq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxbq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-1 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %2 = sext <2 x i8> %1 to <2 x i64>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <2 x i8>, <2 x i8>* %a1, align 1
+ %4 = sext <2 x i8> %3 to <2 x i64>
+ ; The returned sum depends on both extensions.
+ %5 = add <2 x i64> %2, %4
+ ret <2 x i64> %5
+}
+
+; test_pmovsxdq: instruction-scheduling test for SSE4.1 PMOVSXDQ.
+; Elements 0-1 of %a0 are sign-extended from i32 to i64 (register-source
+; form), a <2 x i32> loaded from %a1 is sign-extended the same way
+; (memory-source form), and the two results are added. The check-prefix groups
+; below pin the expected instructions and, where present, their "# sched"
+; comments.
+define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
+; GENERIC-LABEL: test_pmovsxdq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxdq %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxdq (%rdi), %xmm0
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxdq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxdq (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxdq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxdq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxdq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-1 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %2 = sext <2 x i32> %1 to <2 x i64>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <2 x i32>, <2 x i32>* %a1, align 1
+ %4 = sext <2 x i32> %3 to <2 x i64>
+ ; The returned sum depends on both extensions.
+ %5 = add <2 x i64> %2, %4
+ ret <2 x i64> %5
+}
+
+; test_pmovsxwd: instruction-scheduling test for SSE4.1 PMOVSXWD.
+; Elements 0-3 of %a0 are sign-extended from i16 to i32 (register-source
+; form), a <4 x i16> loaded from %a1 is sign-extended the same way
+; (memory-source form), and the two results are added. The check-prefix groups
+; below pin the expected instructions and, where present, their "# sched"
+; comments.
+define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovsxwd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxwd %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxwd (%rdi), %xmm0
+; GENERIC-NEXT: paddd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxwd:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxwd (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxwd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxwd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxwd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-3 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <4 x i16>, <4 x i16>* %a1, align 1
+ %4 = sext <4 x i16> %3 to <4 x i32>
+ ; The returned sum depends on both extensions.
+ %5 = add <4 x i32> %2, %4
+ ret <4 x i32> %5
+}
+
+; test_pmovsxwq: instruction-scheduling test for SSE4.1 PMOVSXWQ.
+; Elements 0-1 of %a0 are sign-extended from i16 to i64 (register-source
+; form), a <2 x i16> loaded from %a1 is sign-extended the same way
+; (memory-source form), and the two results are added. The check-prefix groups
+; below pin the expected instructions and, where present, their "# sched"
+; comments.
+define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovsxwq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovsxwq %xmm0, %xmm1
+; GENERIC-NEXT: pmovsxwq (%rdi), %xmm0
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovsxwq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovsxwq (%rdi), %xmm1 # sched: [4:1.00]
+; SLM-NEXT: pmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovsxwq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovsxwq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovsxwq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-1 of %a0 and sign-extend them (register-source form).
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %2 = sext <2 x i16> %1 to <2 x i64>
+ ; Sign-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <2 x i16>, <2 x i16>* %a1, align 1
+ %4 = sext <2 x i16> %3 to <2 x i64>
+ ; The returned sum depends on both extensions.
+ %5 = add <2 x i64> %2, %4
+ ret <2 x i64> %5
+}
+
+; test_pmovzxbw: instruction-scheduling test for SSE4.1 PMOVZXBW.
+; Elements 0-7 of %a0 are zero-extended from i8 to i16 (register-source form),
+; an <8 x i8> loaded from %a1 is zero-extended the same way (memory-source
+; form), and the two results are added.
+; In the check lines the operands are not matched literally: the FileCheck
+; regex {{.*#+}} skips everything up to the assembly comment marker, and the
+; text after it matches the printed shuffle description ("xmm0[0],zero,..."
+; for the register source, "mem[0],zero,..." for the memory source), followed
+; by the "sched" annotation where one is expected.
+; NOTE(review): "sched: [x:y]" is presumably latency:reciprocal-throughput
+; - confirm against the RUN lines (not visible in this hunk).
+define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; GENERIC-NEXT: paddw %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxbw:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [4:1.00]
+; SLM-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxbw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxbw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-7 of %a0 and zero-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ ; Zero-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <8 x i8>, <8 x i8>* %a1, align 1
+ %4 = zext <8 x i8> %3 to <8 x i16>
+ ; The returned sum depends on both extensions.
+ %5 = add <8 x i16> %2, %4
+ ret <8 x i16> %5
+}
+
+; test_pmovzxbd: instruction-scheduling test for SSE4.1 PMOVZXBD.
+; Elements 0-3 of %a0 are zero-extended from i8 to i32 (register-source form),
+; a <4 x i8> loaded from %a1 is zero-extended the same way (memory-source
+; form), and the two results are added.
+; The check lines use the FileCheck regex {{.*#+}} to skip the operands and
+; then match the printed shuffle description (three "zero" bytes after each
+; source byte), followed by the "sched" annotation where one is expected.
+define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; GENERIC-NEXT: paddd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxbd:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [4:1.00]
+; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxbd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxbd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-3 of %a0 and zero-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ ; Zero-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <4 x i8>, <4 x i8>* %a1, align 1
+ %4 = zext <4 x i8> %3 to <4 x i32>
+ ; The returned sum depends on both extensions.
+ %5 = add <4 x i32> %2, %4
+ ret <4 x i32> %5
+}
+
+; test_pmovzxbq: instruction-scheduling test for SSE4.1 PMOVZXBQ.
+; Elements 0-1 of %a0 are zero-extended from i8 to i64 (register-source form),
+; a <2 x i8> loaded from %a1 is zero-extended the same way (memory-source
+; form), and the two results are added.
+; The check lines use the FileCheck regex {{.*#+}} to skip the operands and
+; then match the printed shuffle description (seven "zero" bytes after each
+; source byte), followed by the "sched" annotation where one is expected.
+define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxbq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [4:1.00]
+; SLM-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxbq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxbq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ ; Select elements 0-1 of %a0 and zero-extend them (register-source form).
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %2 = zext <2 x i8> %1 to <2 x i64>
+ ; Zero-extend a narrow vector loaded from %a1 (memory-source form).
+ %3 = load <2 x i8>, <2 x i8>* %a1, align 1
+ %4 = zext <2 x i8> %3 to <2 x i64>
+ ; The returned sum depends on both extensions.
+ %5 = add <2 x i64> %2, %4
+ ret <2 x i64> %5
+}
+
+define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; zero-extends two i32 lanes to i64 twice: once from %a0, once from memory at %a1
+; GENERIC-LABEL: test_pmovzxdq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxdq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [4:1.00]
+; SLM-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxdq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxdq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxdq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; keep only lanes 0 and 1 of %a0
+ %2 = zext <2 x i32> %1 to <2 x i64> ; extension of the argument (xmm0 source in the checks above)
+ %3 = load <2 x i32>, <2 x i32>* %a1, align 1 ; byte-aligned load of two i32 values from %a1
+ %4 = zext <2 x i32> %3 to <2 x i64> ; extension of the loaded value (mem source in the checks above)
+ %5 = add <2 x i64> %2, %4 ; the sum uses both extended vectors
+ ret <2 x i64> %5
+}
+
+define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; zero-extends four i16 lanes to i32 twice: once from %a0, once from memory at %a1
+; GENERIC-LABEL: test_pmovzxwd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; GENERIC-NEXT: paddd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxwd:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [4:1.00]
+; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxwd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxwd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxwd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only lanes 0 through 3 of %a0
+ %2 = zext <4 x i16> %1 to <4 x i32> ; extension of the argument (xmm0 source in the checks above)
+ %3 = load <4 x i16>, <4 x i16>* %a1, align 1 ; byte-aligned load of four i16 values from %a1
+ %4 = zext <4 x i16> %3 to <4 x i32> ; extension of the loaded value (mem source in the checks above)
+ %5 = add <4 x i32> %2, %4 ; the sum uses both extended vectors
+ ret <4 x i32> %5
+}
+
+define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; zero-extends two i16 lanes to i64 twice: once from %a0, once from memory at %a1
+; GENERIC-LABEL: test_pmovzxwq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; GENERIC-NEXT: paddq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmovzxwq:
+; SLM: # BB#0:
+; SLM-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [4:1.00]
+; SLM-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
+; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovzxwq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmovzxwq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmovzxwq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> ; keep only lanes 0 and 1 of %a0
+ %2 = zext <2 x i16> %1 to <2 x i64> ; extension of the argument (xmm0 source in the checks above)
+ %3 = load <2 x i16>, <2 x i16>* %a1, align 1 ; byte-aligned load of two i16 values from %a1
+ %4 = zext <2 x i16> %3 to <2 x i64> ; extension of the loaded value (mem source in the checks above)
+ %5 = add <2 x i64> %2, %4 ; the sum uses both extended vectors
+ ret <2 x i64> %5
+}
+
+define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; chains two pmuldq intrinsic calls: the first on arguments only, the second on a value loaded from %a2
+; GENERIC-LABEL: test_pmuldq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmuldq %xmm1, %xmm0
+; GENERIC-NEXT: pmuldq (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmuldq:
+; SLM: # BB#0:
+; SLM-NEXT: pmuldq %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: pmuldq (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmuldq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmuldq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmuldq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; both operands are function arguments
+ %2 = bitcast <2 x i64> %1 to <4 x i32> ; reinterpret the first result so it can feed the second call
+ %3 = load <4 x i32>, <4 x i32> *%a2, align 16 ; second operand comes from memory at %a2
+ %4 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %2, <4 x i32> %3) ; loaded operand: the checks above expect the (%rdi) form here
+ ret <2 x i64> %4
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; chains two <4 x i32> multiplies: the first on arguments only, the second on a value loaded from %a2
+; GENERIC-LABEL: test_pmulld:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmulld %xmm1, %xmm0
+; GENERIC-NEXT: pmulld (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pmulld:
+; SLM: # BB#0:
+; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmulld:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmulld:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmulld:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = mul <4 x i32> %a0, %a1 ; both operands are function arguments
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16 ; second multiplicand comes from memory at %a2
+ %3 = mul <4 x i32> %1, %2 ; loaded operand: the checks above expect the (%rdi) form here
+ ret <4 x i32> %3
+}
+
+define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; calls the ptestc intrinsic on %a0 twice (against %a1, then against a value loaded from %a2) and ANDs the results
+; GENERIC-LABEL: test_ptest:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: ptest %xmm1, %xmm0
+; GENERIC-NEXT: setb %al
+; GENERIC-NEXT: ptest (%rdi), %xmm0
+; GENERIC-NEXT: setb %cl
+; GENERIC-NEXT: andb %al, %cl
+; GENERIC-NEXT: movzbl %cl, %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_ptest:
+; SLM: # BB#0:
+; SLM-NEXT: ptest %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: setb %al # sched: [1:0.50]
+; SLM-NEXT: ptest (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: setb %cl # sched: [1:0.50]
+; SLM-NEXT: andb %al, %cl # sched: [1:0.50]
+; SLM-NEXT: movzbl %cl, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ptest:
+; SANDY: # BB#0:
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: setb %al # sched: [1:0.33]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
+; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_ptest:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00]
+; HASWELL-NEXT: setb %cl # sched: [1:0.50]
+; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_ptest:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: setb %al # sched: [1:0.50]
+; BTVER2-NEXT: vptest (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: setb %cl # sched: [1:0.50]
+; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50]
+; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; both operands are function arguments; the checks above read the result with setb
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16 ; second operand for the next call comes from memory at %a2
+ %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) ; loaded operand: the checks above expect the (%rdi) form here
+ %4 = and i32 %1, %3 ; the returned value depends on both test results
+ ret i32 %4
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) { ; calls the round.pd intrinsic on %a0 and on a value loaded from %a1, then adds the two results
+; GENERIC-LABEL: test_roundpd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: roundpd $7, %xmm0, %xmm1
+; GENERIC-NEXT: roundpd $7, (%rdi), %xmm0
+; GENERIC-NEXT: addpd %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_roundpd:
+; SLM: # BB#0:
+; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
+; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
+; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_roundpd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundpd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundpd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; argument operand; the i32 7 appears as the $7 immediate in the checks above
+ %2 = load <2 x double>, <2 x double> *%a1, align 16 ; operand for the second call comes from memory at %a1
+ %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) ; loaded operand: the checks above expect the (%rdi) form here
+ %4 = fadd <2 x double> %1, %3 ; the sum uses both rounded vectors
+ ret <2 x double> %4
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) { ; calls the round.ps intrinsic on %a0 and on a value loaded from %a1, then adds the two results
+; GENERIC-LABEL: test_roundps:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: roundps $7, %xmm0, %xmm1
+; GENERIC-NEXT: roundps $7, (%rdi), %xmm0
+; GENERIC-NEXT: addps %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_roundps:
+; SLM: # BB#0:
+; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00]
+; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_roundps:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundps:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundps:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; argument operand; the i32 7 appears as the $7 immediate in the checks above
+ %2 = load <4 x float>, <4 x float> *%a1, align 16 ; operand for the second call comes from memory at %a1
+ %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) ; loaded operand: the checks above expect the (%rdi) form here
+ %4 = fadd <4 x float> %1, %3 ; the sum uses both rounded vectors
+ ret <4 x float> %4
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; calls the round.sd intrinsic with %a0 as first operand twice (second operand %a1, then a value loaded from %a2) and adds the results
+; GENERIC-LABEL: test_roundsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movaps %xmm0, %xmm2
+; GENERIC-NEXT: roundsd $7, %xmm1, %xmm2
+; GENERIC-NEXT: roundsd $7, (%rdi), %xmm0
+; GENERIC-NEXT: addpd %xmm2, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_roundsd:
+; SLM: # BB#0:
+; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
+; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_roundsd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundsd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
+; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; both vector operands are function arguments; the i32 7 appears as $7 in the checks above
+ %2 = load <2 x double>, <2 x double>* %a2, align 16 ; second operand for the next call comes from memory at %a2
+ %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) ; %a0 is reused as first operand; the checks above expect the (%rdi) form here
+ %4 = fadd <2 x double> %1, %3 ; the sum uses both results
+ ret <2 x double> %4
+}
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; calls the round.ss intrinsic with %a0 as first operand twice (second operand %a1, then a value loaded from %a2) and adds the results
+; GENERIC-LABEL: test_roundss:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movaps %xmm0, %xmm2
+; GENERIC-NEXT: roundss $7, %xmm1, %xmm2
+; GENERIC-NEXT: roundss $7, (%rdi), %xmm0
+; GENERIC-NEXT: addps %xmm2, %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_roundss:
+; SLM: # BB#0:
+; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT: roundss $7, (%rdi), %xmm0 # sched: [6:1.00]
+; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
+; SLM-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_roundss:
+; SANDY: # BB#0:
+; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_roundss:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
+; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_roundss:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; both vector operands are function arguments; the i32 7 appears as $7 in the checks above
+ %2 = load <4 x float>, <4 x float> *%a2, align 16 ; second operand for the next call comes from memory at %a2
+ %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) ; %a0 is reused as first operand; the checks above expect the (%rdi) form here
+ %4 = fadd <4 x float> %1, %3 ; the sum uses both results
+ ret <4 x float> %4
+}
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
new file mode 100644
index 0000000000000..afc48bc57ee7d
--- /dev/null
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -0,0 +1,477 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+
+define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; chains two crc32.32.8 intrinsic calls: the first folds in %a1, the second folds in an i8 loaded from %a2
+; GENERIC-LABEL: crc32_32_8:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: crc32b %sil, %edi
+; GENERIC-NEXT: crc32b (%rdx), %edi
+; GENERIC-NEXT: movl %edi, %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: crc32_32_8:
+; SLM: # BB#0:
+; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: crc32_32_8:
+; SANDY: # BB#0:
+; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: crc32_32_8:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: crc32_32_8:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) ; both operands are function arguments
+ %2 = load i8, i8 *%a2 ; data byte for the second call comes from memory at %a2
+ %3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) ; first result is the running value; the checks above expect the (%rdx) form here
+ ret i32 %3
+}
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+
+define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) { ; chains two crc32.32.16 intrinsic calls: the first folds in %a1, the second folds in an i16 loaded from %a2
+; GENERIC-LABEL: crc32_32_16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: crc32w %si, %edi
+; GENERIC-NEXT: crc32w (%rdx), %edi
+; GENERIC-NEXT: movl %edi, %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: crc32_32_16:
+; SLM: # BB#0:
+; SLM-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; SLM-NEXT: crc32w (%rdx), %edi # sched: [6:1.00]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: crc32_32_16:
+; SANDY: # BB#0:
+; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: crc32_32_16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: crc32_32_16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) ; both operands are function arguments
+ %2 = load i16, i16 *%a2 ; data word for the second call comes from memory at %a2
+ %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) ; first result is the running value; the checks above expect the (%rdx) form here
+ ret i32 %3
+}
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+
+define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) { ; chains two crc32.32.32 intrinsic calls: the first folds in %a1, the second folds in an i32 loaded from %a2
+; GENERIC-LABEL: crc32_32_32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: crc32l %esi, %edi
+; GENERIC-NEXT: crc32l (%rdx), %edi
+; GENERIC-NEXT: movl %edi, %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: crc32_32_32:
+; SLM: # BB#0:
+; SLM-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; SLM-NEXT: crc32l (%rdx), %edi # sched: [6:1.00]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: crc32_32_32:
+; SANDY: # BB#0:
+; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: crc32_32_32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: crc32_32_32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) ; both operands are function arguments
+ %2 = load i32, i32 *%a2 ; data dword for the second call comes from memory at %a2
+ %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) ; first result is the running value; the checks above expect the (%rdx) form here
+ ret i32 %3
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind { ; chains two crc32.64.8 intrinsic calls: the first folds in %a1, the second folds in an i8 loaded from %a2
+; GENERIC-LABEL: crc32_64_8:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: crc32b %sil, %edi
+; GENERIC-NEXT: crc32b (%rdx), %edi
+; GENERIC-NEXT: movq %rdi, %rax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: crc32_64_8:
+; SLM: # BB#0:
+; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: crc32_64_8:
+; SANDY: # BB#0:
+; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: crc32_64_8:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: crc32_64_8:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) ; i64 running value; note the checks above print the destination as %edi while the final copy is movq %rdi
+ %2 = load i8, i8 *%a2 ; data byte for the second call comes from memory at %a2
+ %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) ; first result is the running value; the checks above expect the (%rdx) form here
+ ret i64 %3
+}
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+
+define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) { ; chains two crc32.64.64 intrinsic calls: the first folds in %a1, the second folds in an i64 loaded from %a2
+; GENERIC-LABEL: crc32_64_64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: crc32q %rsi, %rdi
+; GENERIC-NEXT: crc32q (%rdx), %rdi
+; GENERIC-NEXT: movq %rdi, %rax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: crc32_64_64:
+; SLM: # BB#0:
+; SLM-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; SLM-NEXT: crc32q (%rdx), %rdi # sched: [6:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: crc32_64_64:
+; SANDY: # BB#0:
+; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: crc32_64_64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: crc32_64_64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) ; both operands are function arguments
+ %2 = load i64, i64 *%a2 ; data qword for the second call comes from memory at %a2
+ %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) ; first result is the running value; the checks above expect the (%rdx) form here
+ ret i64 %3
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
+define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; calls the pcmpestri128 intrinsic on %a0 twice (against %a1, then against a value loaded from %a2) and adds the two i32 results
+; GENERIC-LABEL: test_pcmpestri:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movl $7, %eax
+; GENERIC-NEXT: movl $7, %edx
+; GENERIC-NEXT: pcmpestri $7, %xmm1, %xmm0
+; GENERIC-NEXT: movl %ecx, %esi
+; GENERIC-NEXT: movl $7, %eax
+; GENERIC-NEXT: movl $7, %edx
+; GENERIC-NEXT: pcmpestri $7, (%rdi), %xmm0
+; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; GENERIC-NEXT: leal (%rcx,%rsi), %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpestri:
+; SLM: # BB#0:
+; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
+; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
+; SLM-NEXT: pcmpestri $7, %xmm1, %xmm0 # sched: [21:21.00]
+; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
+; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
+; SLM-NEXT: movl %ecx, %esi # sched: [1:0.50]
+; SLM-NEXT: pcmpestri $7, (%rdi), %xmm0 # sched: [21:21.00]
+; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SLM-NEXT: leal (%rcx,%rsi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpestri:
+; SANDY: # BB#0:
+; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
+; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
+; SANDY-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
+; SANDY-NEXT: movl %ecx, %esi # sched: [1:0.33]
+; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
+; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
+; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpestri:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25]
+; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpestri:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
+; BTVER2-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [13:2.50]
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
+; BTVER2-NEXT: movl %ecx, %esi # sched: [1:0.17]
+; BTVER2-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:2.50]
+; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) ; vector operands are arguments; the two i32 7 values show up as movl $7 into %eax/%edx and the i8 7 as the $7 immediate in the checks above
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16 ; second vector operand for the next call comes from memory at %a2
+ %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7) ; loaded operand: the checks above expect the (%rdi) form here
+ %4 = add i32 %1, %3 ; the sum uses both results (each is read from %ecx in the checks above)
+ ret i32 %4
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; chains two pcmpestrm128 intrinsic calls: the first on arguments only, the second on the first result and a value loaded from %a2
+; GENERIC-LABEL: test_pcmpestrm:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movl $7, %eax
+; GENERIC-NEXT: movl $7, %edx
+; GENERIC-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; GENERIC-NEXT: movl $7, %eax
+; GENERIC-NEXT: movl $7, %edx
+; GENERIC-NEXT: pcmpestrm $7, (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpestrm:
+; SLM: # BB#0:
+; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
+; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
+; SLM-NEXT: pcmpestrm $7, %xmm1, %xmm0 # sched: [17:17.00]
+; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
+; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
+; SLM-NEXT: pcmpestrm $7, (%rdi), %xmm0 # sched: [17:17.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpestrm:
+; SANDY: # BB#0:
+; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
+; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
+; SANDY-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
+; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
+; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
+; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpestrm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpestrm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
+; BTVER2-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [13:2.50]
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
+; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) ; vector operands are arguments; the two i32 7 values show up as movl $7 into %eax/%edx and the i8 7 as the $7 immediate in the checks above
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16 ; second vector operand for the next call comes from memory at %a2
+ %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; first result is reused as first operand; the checks above expect the (%rdi) form here
+ ret <16 x i8> %3
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+; Calls llvm.x86.sse42.pcmpistri128 twice with %a0 as the first operand: once
+; against %a1 and once against a vector loaded from %a2. The two i32 results
+; are summed and returned, so both calls contribute to the returned value.
+define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pcmpistri:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pcmpistri $7, %xmm1, %xmm0
+; GENERIC-NEXT: movl %ecx, %eax
+; GENERIC-NEXT: pcmpistri $7, (%rdi), %xmm0
+; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; GENERIC-NEXT: leal (%rcx,%rax), %eax
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpistri:
+; SLM: # BB#0:
+; SLM-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [17:17.00]
+; SLM-NEXT: movl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: pcmpistri $7, (%rdi), %xmm0 # sched: [17:17.00]
+; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SLM-NEXT: leal (%rcx,%rax), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpistri:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpistri:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpistri:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: movl %ecx, %eax # sched: [1:0.17]
+; BTVER2-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:1.00]
+; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7)
+ %4 = add i32 %1, %3
+ ret i32 %4
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; Calls llvm.x86.sse42.pcmpistrm128 on %a0 and %a1, then feeds that result and
+; a vector loaded from %a2 into a second call, so the assertions cover the
+; register form followed by the (%rdi) memory-operand form.
+define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pcmpistrm:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; GENERIC-NEXT: pcmpistrm $7, (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpistrm:
+; SLM: # BB#0:
+; SLM-NEXT: pcmpistrm $7, %xmm1, %xmm0 # sched: [13:13.00]
+; SLM-NEXT: pcmpistrm $7, (%rdi), %xmm0 # sched: [13:13.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpistrm:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpistrm:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
+; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpistrm:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7)
+ ret <16 x i8> %3
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; Uses plain IR rather than an intrinsic: a signed greater-than compare of
+; <2 x i64> values sign-extended back to <2 x i64>, done once on %a0/%a1 and
+; once on that result against a vector loaded from %a2. The assertions expect
+; each compare+sext pair to be emitted as a single pcmpgtq.
+define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_pcmpgtq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pcmpgtq %xmm1, %xmm0
+; GENERIC-NEXT: pcmpgtq (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; SLM-LABEL: test_pcmpgtq:
+; SLM: # BB#0:
+; SLM-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpgtq (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpgtq:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtq:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pcmpgtq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = icmp sgt <2 x i64> %a0, %a1
+ %2 = sext <2 x i1> %1 to <2 x i64>
+ %3 = load <2 x i64>, <2 x i64>*%a2, align 16
+ %4 = icmp sgt <2 x i64> %2, %3
+ %5 = sext <2 x i1> %4 to <2 x i64>
+ ret <2 x i64> %5
+}
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
new file mode 100644
index 0000000000000..8b7a0c0ec02b6
--- /dev/null
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -0,0 +1,754 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+
+; Applies llvm.x86.ssse3.pabs.b.128 to the register argument %a0 and to a
+; vector loaded from %a1, then ORs the two results and returns the combined
+; value, so both the register and the memory-operand form appear in the output.
+define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pabsb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pabsb %xmm0, %xmm1
+; GENERIC-NEXT: pabsb (%rdi), %xmm0
+; GENERIC-NEXT: por %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pabsb:
+; ATOM: # BB#0:
+; ATOM-NEXT: pabsb (%rdi), %xmm1
+; ATOM-NEXT: pabsb %xmm0, %xmm0
+; ATOM-NEXT: por %xmm0, %xmm1
+; ATOM-NEXT: movdqa %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pabsb:
+; SLM: # BB#0:
+; SLM-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: pabsb (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pabsb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pabsb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpabsb (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
+ %2 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2)
+ %4 = or <16 x i8> %1, %3
+ ret <16 x i8> %4
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+; Applies llvm.x86.ssse3.pabs.d.128 to the register argument %a0 and to a
+; vector loaded from %a1, then ORs the two results and returns the combined
+; value, so both the register and the memory-operand form appear in the output.
+define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_pabsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pabsd %xmm0, %xmm1
+; GENERIC-NEXT: pabsd (%rdi), %xmm0
+; GENERIC-NEXT: por %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pabsd:
+; ATOM: # BB#0:
+; ATOM-NEXT: pabsd (%rdi), %xmm1
+; ATOM-NEXT: pabsd %xmm0, %xmm0
+; ATOM-NEXT: por %xmm0, %xmm1
+; ATOM-NEXT: movdqa %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pabsd:
+; SLM: # BB#0:
+; SLM-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: pabsd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pabsd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pabsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpabsd (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
+ %2 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2)
+ %4 = or <4 x i32> %1, %3
+ ret <4 x i32> %4
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+; Applies llvm.x86.ssse3.pabs.w.128 to the register argument %a0 and to a
+; vector loaded from %a1, then ORs the two results. The OR'd value (%4) is
+; what gets returned; returning %1 would leave the load, the second call and
+; the OR unused, and only the register form would be emitted.
+; NOTE(review): the assertions below mirror test_pabsb/test_pabsd with the
+; word mnemonic; regenerate with utils/update_llc_test_checks.py to confirm.
+define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pabsw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pabsw %xmm0, %xmm1
+; GENERIC-NEXT: pabsw (%rdi), %xmm0
+; GENERIC-NEXT: por %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pabsw:
+; ATOM: # BB#0:
+; ATOM-NEXT: pabsw (%rdi), %xmm1
+; ATOM-NEXT: pabsw %xmm0, %xmm0
+; ATOM-NEXT: por %xmm0, %xmm1
+; ATOM-NEXT: movdqa %xmm1, %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pabsw:
+; SLM: # BB#0:
+; SLM-NEXT: pabsw %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: pabsw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpabsw (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pabsw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pabsw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpabsw (%rdi), %xmm1 # sched: [6:1.00]
+; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
+ %2 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2)
+ %4 = or <8 x i16> %1, %3
+ ret <8 x i16> %4
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+; Two shufflevector operations whose masks each select a contiguous run of
+; eight elements across the concatenation of their two inputs. The first uses
+; %a0 and %a1; the second uses a vector loaded from %a2 and the first result.
+; The assertions expect each shuffle to be emitted as a palignr, one with a
+; register source and one with a memory source.
+define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_palignr:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; GENERIC-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; GENERIC-NEXT: movdqa %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_palignr:
+; ATOM: # BB#0:
+; ATOM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; ATOM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; ATOM-NEXT: movdqa %xmm1, %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_palignr:
+; SLM: # BB#0:
+; SLM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; SLM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [4:1.00]
+; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_palignr:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_palignr:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_palignr:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ ret <8 x i16> %3
+}
+
+; Calls llvm.x86.ssse3.phadd.d.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_phaddd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phaddd %xmm1, %xmm0
+; GENERIC-NEXT: phaddd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phaddd:
+; ATOM: # BB#0:
+; ATOM-NEXT: phaddd %xmm1, %xmm0
+; ATOM-NEXT: phaddd (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phaddd:
+; SLM: # BB#0:
+; SLM-NEXT: phaddd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phaddd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phaddd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phaddd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+; Calls llvm.x86.ssse3.phadd.sw.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_phaddsw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phaddsw %xmm1, %xmm0
+; GENERIC-NEXT: phaddsw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phaddsw:
+; ATOM: # BB#0:
+; ATOM-NEXT: phaddsw %xmm1, %xmm0
+; ATOM-NEXT: phaddsw (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phaddsw:
+; SLM: # BB#0:
+; SLM-NEXT: phaddsw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phaddsw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddsw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phaddsw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phaddsw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+; Calls llvm.x86.ssse3.phadd.w.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_phaddw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phaddw %xmm1, %xmm0
+; GENERIC-NEXT: phaddw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phaddw:
+; ATOM: # BB#0:
+; ATOM-NEXT: phaddw %xmm1, %xmm0
+; ATOM-NEXT: phaddw (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phaddw:
+; SLM: # BB#0:
+; SLM-NEXT: phaddw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phaddw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phaddw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phaddw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+; Calls llvm.x86.ssse3.phsub.d.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_phsubd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phsubd %xmm1, %xmm0
+; GENERIC-NEXT: phsubd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phsubd:
+; ATOM: # BB#0:
+; ATOM-NEXT: phsubd %xmm1, %xmm0
+; ATOM-NEXT: phsubd (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phsubd:
+; SLM: # BB#0:
+; SLM-NEXT: phsubd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phsubd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phsubd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phsubd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+; Calls llvm.x86.ssse3.phsub.sw.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_phsubsw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phsubsw %xmm1, %xmm0
+; GENERIC-NEXT: phsubsw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phsubsw:
+; ATOM: # BB#0:
+; ATOM-NEXT: phsubsw %xmm1, %xmm0
+; ATOM-NEXT: phsubsw (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phsubsw:
+; SLM: # BB#0:
+; SLM-NEXT: phsubsw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phsubsw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubsw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phsubsw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phsubsw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+; Calls llvm.x86.ssse3.phsub.w.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_phsubw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: phsubw %xmm1, %xmm0
+; GENERIC-NEXT: phsubw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_phsubw:
+; ATOM: # BB#0:
+; ATOM-NEXT: phsubw %xmm1, %xmm0
+; ATOM-NEXT: phsubw (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_phsubw:
+; SLM: # BB#0:
+; SLM-NEXT: phsubw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: phsubw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_phsubw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_phsubw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+; Calls llvm.x86.ssse3.pmadd.ub.sw.128 on %a0 and %a1, bitcasts the <8 x i16>
+; result back to <16 x i8> so it can be the first operand of a second call,
+; and pairs it with a vector loaded from %a2. This covers the register form
+; followed by the (%rdi) memory-operand form.
+define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pmaddubsw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmaddubsw %xmm1, %xmm0
+; GENERIC-NEXT: pmaddubsw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pmaddubsw:
+; ATOM: # BB#0:
+; ATOM-NEXT: pmaddubsw %xmm1, %xmm0
+; ATOM-NEXT: pmaddubsw (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pmaddubsw:
+; SLM: # BB#0:
+; SLM-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaddubsw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmaddubsw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmaddubsw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %3, <16 x i8> %2)
+ ret <8 x i16> %4
+}
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+; Calls llvm.x86.ssse3.pmul.hr.sw.128 on %a0 and %a1, then again on that
+; result and a vector loaded from %a2. The second call's result (%3) is what
+; gets returned; returning %1 would leave the load and the second call unused,
+; and only the register form would be emitted.
+; NOTE(review): the memory-form assertions below reuse the figures shown for
+; test_pmaddubsw, whose register-form figures match this instruction's;
+; regenerate with utils/update_llc_test_checks.py to confirm.
+define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_pmulhrsw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pmulhrsw %xmm1, %xmm0
+; GENERIC-NEXT: pmulhrsw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pmulhrsw:
+; ATOM: # BB#0:
+; ATOM-NEXT: pmulhrsw %xmm1, %xmm0
+; ATOM-NEXT: pmulhrsw (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pmulhrsw:
+; SLM: # BB#0:
+; SLM-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: pmulhrsw (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmulhrsw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pmulhrsw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pmulhrsw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+; Calls llvm.x86.ssse3.pshuf.b.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_pshufb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: pshufb %xmm1, %xmm0
+; GENERIC-NEXT: pshufb (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_pshufb:
+; ATOM: # BB#0:
+; ATOM-NEXT: pshufb %xmm1, %xmm0
+; ATOM-NEXT: pshufb (%rdi), %xmm0
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_pshufb:
+; SLM: # BB#0:
+; SLM-NEXT: pshufb %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: pshufb (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pshufb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pshufb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_pshufb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2)
+ ret <16 x i8> %3
+}
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+; Calls llvm.x86.ssse3.psign.b.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
+; GENERIC-LABEL: test_psignb:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: psignb %xmm1, %xmm0
+; GENERIC-NEXT: psignb (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_psignb:
+; ATOM: # BB#0:
+; ATOM-NEXT: psignb %xmm1, %xmm0
+; ATOM-NEXT: psignb (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_psignb:
+; SLM: # BB#0:
+; SLM-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: psignb (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignb:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_psignb:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_psignb:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
+ %2 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2)
+ ret <16 x i8> %3
+}
+declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+; Calls llvm.x86.ssse3.psign.d.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psignd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: psignd %xmm1, %xmm0
+; GENERIC-NEXT: psignd (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_psignd:
+; ATOM: # BB#0:
+; ATOM-NEXT: psignd %xmm1, %xmm0
+; ATOM-NEXT: psignd (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_psignd:
+; SLM: # BB#0:
+; SLM-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: psignd (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignd:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_psignd:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_psignd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+; Calls llvm.x86.ssse3.psign.w.128 on %a0 and %a1, then again on that result
+; and a vector loaded from %a2, covering the register form followed by the
+; (%rdi) memory-operand form.
+define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_psignw:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: psignw %xmm1, %xmm0
+; GENERIC-NEXT: psignw (%rdi), %xmm0
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test_psignw:
+; ATOM: # BB#0:
+; ATOM-NEXT: psignw %xmm1, %xmm0
+; ATOM-NEXT: psignw (%rdi), %xmm0
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
+;
+; SLM-LABEL: test_psignw:
+; SLM: # BB#0:
+; SLM-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: psignw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignw:
+; SANDY: # BB#0:
+; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_psignw:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_psignw:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
+}
+declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
index cc384e19394f2..15fb25777eddc 100644
--- a/test/CodeGen/X86/statepoint-vector.ll
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -22,7 +22,7 @@ define <2 x i8 addrspace(1)*> @test2(<2 x i8 addrspace(1)*> %obj, i64 %offset) g
entry:
; CHECK-LABEL: @test2
; CHECK: subq $40, %rsp
-; CHECK: movd %rdi, %xmm1
+; CHECK: movq %rdi, %xmm1
; CHECK: pshufd $68, %xmm1, %xmm1 # xmm1 = xmm1[0,1,0,1]
; CHECK: paddq %xmm0, %xmm1
; CHECK: movdqa %xmm0, 16(%rsp)
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 805bc25c17b62..ac0b43b2402f8 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
@i = thread_local global i32 15
@j = internal thread_local global i32 42
@@ -11,9 +11,9 @@ entry:
ret i32 %tmp1
}
-; X32-LABEL: f1:
-; X32: leal i@TLSGD(,%ebx), %eax
-; X32: calll ___tls_get_addr@PLT
+; X86-LABEL: f1:
+; X86: leal i@TLSGD(,%ebx), %eax
+; X86: calll ___tls_get_addr@PLT
; X64-LABEL: f1:
; X64: leaq i@TLSGD(%rip), %rdi
@@ -27,9 +27,9 @@ entry:
ret i32* @i
}
-; X32-LABEL: f2:
-; X32: leal i@TLSGD(,%ebx), %eax
-; X32: calll ___tls_get_addr@PLT
+; X86-LABEL: f2:
+; X86: leal i@TLSGD(,%ebx), %eax
+; X86: calll ___tls_get_addr@PLT
; X64-LABEL: f2:
; X64: leaq i@TLSGD(%rip), %rdi
@@ -43,9 +43,9 @@ entry:
ret i32 %tmp1
}
-; X32-LABEL: f3:
-; X32: leal i@TLSGD(,%ebx), %eax
-; X32: calll ___tls_get_addr@PLT
+; X86-LABEL: f3:
+; X86: leal i@TLSGD(,%ebx), %eax
+; X86: calll ___tls_get_addr@PLT
; X64-LABEL: f3:
; X64: leaq i@TLSGD(%rip), %rdi
@@ -57,9 +57,9 @@ entry:
ret i32* @i
}
-; X32-LABEL: f4:
-; X32: leal i@TLSGD(,%ebx), %eax
-; X32: calll ___tls_get_addr@PLT
+; X86-LABEL: f4:
+; X86: leal i@TLSGD(,%ebx), %eax
+; X86: calll ___tls_get_addr@PLT
; X64-LABEL: f4:
; X64: leaq i@TLSGD(%rip), %rdi
@@ -74,11 +74,11 @@ entry:
ret i32 %add
}
-; X32-LABEL: f5:
-; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32: calll ___tls_get_addr@PLT
-; X32: movl {{[jk]}}@DTPOFF(%e
-; X32: addl {{[jk]}}@DTPOFF(%e
+; X86-LABEL: f5:
+; X86: leal {{[jk]}}@TLSLDM(%ebx)
+; X86: calll ___tls_get_addr@PLT
+; X86: movl {{[jk]}}@DTPOFF(%e
+; X86: addl {{[jk]}}@DTPOFF(%e
; X64-LABEL: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index 842a3bab66473..7a7e40362bcf9 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,81 +1,112 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
-; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
-; RUN: | FileCheck -check-prefix=X64 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnux32 -relocation-model=pic | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
@i = thread_local global i32 15
@i2 = external thread_local global i32
define i32 @f1() {
+; X86-LABEL: f1:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl %gs:i@NTPOFF, %eax
+; X86-NEXT: retl
+;
; X32-LABEL: f1:
-; X32: movl %gs:i@NTPOFF, %eax
-; X32-NEXT: ret
+; X32: # BB#0: # %entry
+; X32-NEXT: movl %fs:i@TPOFF, %eax
+; X32-NEXT: retq
+;
; X64-LABEL: f1:
-; X64: movl %fs:i@TPOFF, %eax
-; X64-NEXT: ret
-
+; X64: # BB#0: # %entry
+; X64-NEXT: movl %fs:i@TPOFF, %eax
+; X64-NEXT: retq
entry:
%tmp1 = load i32, i32* @i
ret i32 %tmp1
}
define i32* @f2() {
+; X86-LABEL: f2:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl %gs:0, %eax
+; X86-NEXT: leal i@NTPOFF(%eax), %eax
+; X86-NEXT: retl
+;
; X32-LABEL: f2:
-; X32: movl %gs:0, %eax
-; X32-NEXT: leal i@NTPOFF(%eax), %eax
-; X32-NEXT: ret
+; X32: # BB#0: # %entry
+; X32-NEXT: movl %fs:0, %eax
+; X32-NEXT: leal i@TPOFF(%rax), %eax
+; X32-NEXT: retq
+;
; X64-LABEL: f2:
-; X64: movq %fs:0, %rax
-; X64-NEXT: leaq i@TPOFF(%rax), %rax
-; X64-NEXT: ret
-
+; X64: # BB#0: # %entry
+; X64-NEXT: movq %fs:0, %rax
+; X64-NEXT: leaq i@TPOFF(%rax), %rax
+; X64-NEXT: retq
entry:
ret i32* @i
}
define i32 @f3() {
+; X86-LABEL: f3:
+; X86: # BB#0: # %entry
+; X86-NEXT: calll .L2$pb
+; X86-NEXT: .Lcfi0:
+; X86-NEXT: .cfi_adjust_cfa_offset 4
+; X86-NEXT: .L2$pb:
+; X86-NEXT: popl %eax
+; X86-NEXT: .Lcfi1:
+; X86-NEXT: .cfi_adjust_cfa_offset -4
+; X86-NEXT: .Ltmp0:
+; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L2$pb), %eax
+; X86-NEXT: movl i2@GOTNTPOFF(%eax), %eax
+; X86-NEXT: movl %gs:(%eax), %eax
+; X86-NEXT: retl
+;
; X32-LABEL: f3:
-; X32: calll .L{{[0-9]+}}$pb
-; X32-NEXT: .Lcfi{{[0-9]+}}:
-; X32-NEXT: .cfi_adjust_cfa_offset 4
-; X32-NEXT: .L{{[0-9]+}}$pb:
-; X32-NEXT: popl %eax
-; X32-NEXT: .Lcfi{{[0-9]+}}:
-; X32-NEXT: .cfi_adjust_cfa_offset -4
-; X32-NEXT: .Ltmp{{[0-9]+}}:
-; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax
-; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax
-; X32-NEXT: movl %gs:(%eax), %eax
-; X32-NEXT: ret
+; X32: # BB#0: # %entry
+; X32-NEXT: movl i2@{{.*}}(%rip), %eax
+; X32-NEXT: movl %fs:(%eax), %eax
+; X32-NEXT: retq
+;
; X64-LABEL: f3:
-; X64: movq i2@GOTTPOFF(%rip), %rax
-; X64-NEXT: movl %fs:(%rax), %eax
-; X64-NEXT: ret
-
+; X64: # BB#0: # %entry
+; X64-NEXT: movq i2@{{.*}}(%rip), %rax
+; X64-NEXT: movl %fs:(%rax), %eax
+; X64-NEXT: retq
entry:
%tmp1 = load i32, i32* @i2
ret i32 %tmp1
}
define i32* @f4() {
+; X86-LABEL: f4:
+; X86: # BB#0: # %entry
+; X86-NEXT: calll .L3$pb
+; X86-NEXT: .Lcfi2:
+; X86-NEXT: .cfi_adjust_cfa_offset 4
+; X86-NEXT: .L3$pb:
+; X86-NEXT: popl %ecx
+; X86-NEXT: .Lcfi3:
+; X86-NEXT: .cfi_adjust_cfa_offset -4
+; X86-NEXT: .Ltmp1:
+; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L3$pb), %ecx
+; X86-NEXT: movl %gs:0, %eax
+; X86-NEXT: addl i2@GOTNTPOFF(%ecx), %eax
+; X86-NEXT: retl
+;
; X32-LABEL: f4:
-; X32: calll .L{{[0-9]+}}$pb
-; X32-NEXT: .Lcfi{{[0-9]+}}:
-; X32-NEXT: .cfi_adjust_cfa_offset 4
-; X32-NEXT: .L{{[0-9]+}}$pb:
-; X32-NEXT: popl %ecx
-; X32-NEXT: .Lcfi{{[0-9]+}}:
-; X32-NEXT: .cfi_adjust_cfa_offset -4
-; X32-NEXT: .Ltmp{{[0-9]+}}:
-; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx
-; X32-NEXT: movl %gs:0, %eax
-; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax
-; X32-NEXT: ret
+; X32: # BB#0: # %entry
+; X32-NEXT: movl %fs:0, %eax
+; X32-NEXT: addl i2@{{.*}}(%rip), %eax
+; X32-NEXT: retq
+;
; X64-LABEL: f4:
-; X64: movq %fs:0, %rax
-; X64-NEXT: addq i2@GOTTPOFF(%rip), %rax
-; X64-NEXT: ret
-
+; X64: # BB#0: # %entry
+; X64-NEXT: movq %fs:0, %rax
+; X64-NEXT: addq i2@{{.*}}(%rip), %rax
+; X64-NEXT: retq
entry:
ret i32* @i2
}
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 85c51e618b2a7..d39716aab7643 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32_LINUX %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
-; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X32_WIN %s
+; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X86_WIN %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
; RUN: llc < %s -march=x86 -mtriple=x86-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s
@@ -16,18 +16,18 @@
@b2 = thread_local(localexec) global i8 0
define i32 @f1() {
-; X32_LINUX-LABEL: f1:
-; X32_LINUX: movl %gs:i1@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f1:
+; X86_LINUX: movl %gs:i1@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f1:
; X64_LINUX: movl %fs:i1@TPOFF, %eax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f1:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movl _i1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f1:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movl _i1@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f1:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -47,20 +47,20 @@ entry:
}
define i32* @f2() {
-; X32_LINUX-LABEL: f2:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal i1@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f2:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal i1@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f2:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i1@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f2:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: leal _i1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f2:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: leal _i1@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f2:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -79,20 +79,20 @@ entry:
}
define i32 @f3() nounwind {
-; X32_LINUX-LABEL: f3:
-; X32_LINUX: movl i2@INDNTPOFF, %eax
-; X32_LINUX-NEXT: movl %gs:(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f3:
+; X86_LINUX: movl i2@INDNTPOFF, %eax
+; X86_LINUX-NEXT: movl %gs:(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f3:
; X64_LINUX: movq i2@GOTTPOFF(%rip), %rax
; X64_LINUX-NEXT: movl %fs:(%rax), %eax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f3:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movl _i2@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f3:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movl _i2@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f3:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -112,20 +112,20 @@ entry:
}
define i32* @f4() {
-; X32_LINUX-LABEL: f4:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: addl i2@INDNTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f4:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: addl i2@INDNTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f4:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: addq i2@GOTTPOFF(%rip), %rax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f4:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: leal _i2@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f4:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: leal _i2@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f4:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -144,18 +144,18 @@ entry:
}
define i32 @f5() nounwind {
-; X32_LINUX-LABEL: f5:
-; X32_LINUX: movl %gs:i3@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f5:
+; X86_LINUX: movl %gs:i3@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f5:
; X64_LINUX: movl %fs:i3@TPOFF, %eax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f5:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movl _i3@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f5:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movl _i3@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f5:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -175,20 +175,20 @@ entry:
}
define i32* @f6() {
-; X32_LINUX-LABEL: f6:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal i3@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f6:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal i3@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f6:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i3@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f6:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: leal _i3@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f6:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: leal _i3@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f6:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -207,9 +207,9 @@ entry:
}
define i32 @f7() {
-; X32_LINUX-LABEL: f7:
-; X32_LINUX: movl %gs:i4@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f7:
+; X86_LINUX: movl %gs:i4@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f7:
; X64_LINUX: movl %fs:i4@TPOFF, %eax
; X64_LINUX-NEXT: ret
@@ -226,10 +226,10 @@ entry:
}
define i32* @f8() {
-; X32_LINUX-LABEL: f8:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal i4@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f8:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal i4@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f8:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i4@TPOFF(%rax), %rax
@@ -246,9 +246,9 @@ entry:
}
define i32 @f9() {
-; X32_LINUX-LABEL: f9:
-; X32_LINUX: movl %gs:i5@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f9:
+; X86_LINUX: movl %gs:i5@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f9:
; X64_LINUX: movl %fs:i5@TPOFF, %eax
; X64_LINUX-NEXT: ret
@@ -265,10 +265,10 @@ entry:
}
define i32* @f10() {
-; X32_LINUX-LABEL: f10:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal i5@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f10:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal i5@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f10:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i5@TPOFF(%rax), %rax
@@ -285,18 +285,18 @@ entry:
}
define i16 @f11() {
-; X32_LINUX-LABEL: f11:
-; X32_LINUX: movzwl %gs:s1@NTPOFF, %eax
-; X32_LINUX: ret
+; X86_LINUX-LABEL: f11:
+; X86_LINUX: movzwl %gs:s1@NTPOFF, %eax
+; X86_LINUX: ret
; X64_LINUX-LABEL: f11:
; X64_LINUX: movzwl %fs:s1@TPOFF, %eax
; X64_LINUX: ret
-; X32_WIN-LABEL: f11:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax
-; X32_WIN: ret
+; X86_WIN-LABEL: f11:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax
+; X86_WIN: ret
; X64_WIN-LABEL: f11:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -316,18 +316,18 @@ entry:
}
define i32 @f12() {
-; X32_LINUX-LABEL: f12:
-; X32_LINUX: movswl %gs:s1@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f12:
+; X86_LINUX: movswl %gs:s1@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f12:
; X64_LINUX: movswl %fs:s1@TPOFF, %eax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f12:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movswl _s1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f12:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movswl _s1@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f12:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -349,18 +349,18 @@ entry:
}
define i8 @f13() {
-; X32_LINUX-LABEL: f13:
-; X32_LINUX: movb %gs:b1@NTPOFF, %al
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f13:
+; X86_LINUX: movb %gs:b1@NTPOFF, %al
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f13:
; X64_LINUX: movb %fs:b1@TPOFF, %al
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f13:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movb _b1@SECREL32(%eax), %al
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f13:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movb _b1@SECREL32(%eax), %al
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f13:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -380,18 +380,18 @@ entry:
}
define i32 @f14() {
-; X32_LINUX-LABEL: f14:
-; X32_LINUX: movsbl %gs:b1@NTPOFF, %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f14:
+; X86_LINUX: movsbl %gs:b1@NTPOFF, %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f14:
; X64_LINUX: movsbl %fs:b1@TPOFF, %eax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f14:
-; X32_WIN: movl __tls_index, %eax
-; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
-; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
-; X32_WIN-NEXT: movsbl _b1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f14:
+; X86_WIN: movl __tls_index, %eax
+; X86_WIN-NEXT: movl %fs:__tls_array, %ecx
+; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax
+; X86_WIN-NEXT: movsbl _b1@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f14:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
@@ -412,19 +412,19 @@ entry:
}
define i8* @f15() {
-; X32_LINUX-LABEL: f15:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal b2@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f15:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal b2@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f15:
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq b2@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
-; X32_WIN-LABEL: f15:
-; X32_WIN: movl %fs:__tls_array, %eax
-; X32_WIN-NEXT: movl (%eax), %eax
-; X32_WIN-NEXT: leal _b2@SECREL32(%eax), %eax
-; X32_WIN-NEXT: ret
+; X86_WIN-LABEL: f15:
+; X86_WIN: movl %fs:__tls_array, %eax
+; X86_WIN-NEXT: movl (%eax), %eax
+; X86_WIN-NEXT: leal _b2@SECREL32(%eax), %eax
+; X86_WIN-NEXT: ret
; X64_WIN-LABEL: f15:
; X64_WIN: movq %gs:88, %rax
; X64_WIN-NEXT: movq (%rax), %rax
@@ -441,10 +441,10 @@ entry:
define i32* @f16() {
-; X32_LINUX-LABEL: f16:
-; X32_LINUX: movl %gs:0, %eax
-; X32_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax
-; X32_LINUX-NEXT: ret
+; X86_LINUX-LABEL: f16:
+; X86_LINUX: movl %gs:0, %eax
+; X86_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
; X64_LINUX-LABEL: f16:
; X64_LINUX: movq %fs:0, %rax
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index 78799ff04fe1c..9804f0ef983ba 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -10,7 +10,7 @@
define <4 x float> @t1(<4 x float> %Q) nounwind {
; X32-SSE-LABEL: t1:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: xorps .LCPI0_0, %xmm0
+; X32-SSE-NEXT: xorps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: t1:
@@ -92,7 +92,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
; X64-SSE2-NEXT: xorq %rdi, %rax
-; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: retq
%bitcast = bitcast i64 %i to <2 x float>
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index a345f78e18c13..477150016486b 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -20,10 +20,10 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -128,16 +128,16 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
@@ -263,7 +263,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
@@ -272,7 +272,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -347,7 +347,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
@@ -356,7 +356,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
@@ -428,7 +428,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm1, %xmm3
@@ -437,7 +437,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: retq
@@ -507,7 +507,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
@@ -516,13 +516,13 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm0
+; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovbq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, %xmm0
@@ -586,7 +586,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm0
+; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: subsd %xmm3, %xmm4
@@ -595,7 +595,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: subsd %xmm3, %xmm2
@@ -604,7 +604,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subsd %xmm3, %xmm4
@@ -613,7 +613,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
@@ -761,7 +761,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subsd %xmm2, %xmm4
@@ -770,7 +770,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
@@ -779,7 +779,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: subsd %xmm2, %xmm4
@@ -788,7 +788,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm1, %xmm0
@@ -879,10 +879,10 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -940,10 +940,10 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_2i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1016,19 +1016,19 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
@@ -1124,19 +1124,19 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_4i64:
; SSE: # BB#0:
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movq %rax, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
@@ -1245,7 +1245,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
@@ -1254,7 +1254,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1390,7 +1390,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
@@ -1399,7 +1399,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1477,7 +1477,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: subss %xmm2, %xmm3
@@ -1486,7 +1486,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rcx
; SSE-NEXT: ucomiss %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movd %rcx, %xmm0
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1685,7 +1685,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: movaps %xmm3, %xmm4
@@ -1695,7 +1695,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
@@ -1706,7 +1706,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: subss %xmm1, %xmm4
@@ -1715,7 +1715,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
@@ -1863,7 +1863,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: movaps %xmm3, %xmm4
@@ -1873,7 +1873,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
@@ -1884,7 +1884,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm3, %rdx
; SSE-NEXT: ucomiss %xmm1, %xmm3
; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movq %rdx, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: subss %xmm1, %xmm4
@@ -1893,7 +1893,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
@@ -2257,9 +2257,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
@@ -2407,12 +2407,12 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-NEXT: movq %rdx, %rdi
; SSE-NEXT: movq %rcx, %rsi
; SSE-NEXT: callq __fixtfdi
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movq %rbx, %rdi
; SSE-NEXT: movq %r14, %rsi
; SSE-NEXT: callq __fixtfdi
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
; SSE-NEXT: xorps %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index 2d55ffbd6e7a6..ff8b1f14c52de 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -15,7 +15,7 @@ define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
;
; X64-LABEL: t1:
; X64: # BB#0:
-; X64-NEXT: movd %rdi, %xmm1
+; X64-NEXT: movq %rdi, %xmm1
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index a37c377e890e5..e7c06a99df9cc 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -19,7 +19,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
; X64: # BB#0:
; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: shll $12, %edi
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index 143957e29ed67..fffafe7697dad 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -17,7 +17,7 @@ define x86_mmx @t0(i32 %A) nounwind {
; X64-LABEL: t0:
; X64: ## BB#0:
; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 649b45712f578..a42b3c96c3ae6 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -19,10 +19,10 @@
define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -217,17 +217,17 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
@@ -1047,10 +1047,10 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -1111,10 +1111,10 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32_zero:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1170,11 +1170,11 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -1367,17 +1367,17 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1610,7 +1610,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_1
; SSE-NEXT: # BB#2:
@@ -1627,7 +1627,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB39_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_4
; SSE-NEXT: # BB#5:
@@ -1729,7 +1729,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f32:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_1
; SSE-NEXT: # BB#2:
@@ -1745,7 +1745,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB40_3:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_4
; SSE-NEXT: # BB#5:
@@ -1845,7 +1845,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: .LBB41_2:
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_3
; SSE-NEXT: # BB#4:
@@ -1863,7 +1863,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: .LBB41_5:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_6
; SSE-NEXT: # BB#7:
@@ -2145,7 +2145,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE: # BB#0:
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_1
; SSE-NEXT: # BB#2:
@@ -2159,7 +2159,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB47_3:
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_4
; SSE-NEXT: # BB#5:
@@ -2174,7 +2174,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB47_6:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_7
; SSE-NEXT: # BB#8:
@@ -2192,7 +2192,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: .LBB47_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_10
; SSE-NEXT: # BB#11:
@@ -2591,10 +2591,10 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-LABEL: sitofp_load_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2733,18 +2733,18 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -3382,17 +3382,17 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3549,34 +3549,34 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm4
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movd %xmm4, %rax
+; SSE-NEXT: movq %xmm4, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: xorps %xmm3, %xmm3
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -3824,7 +3824,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm3
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_1
; SSE-NEXT: # BB#2:
@@ -3838,7 +3838,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB76_3:
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_4
; SSE-NEXT: # BB#5:
@@ -3853,7 +3853,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB76_6:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_7
; SSE-NEXT: # BB#8:
@@ -3871,7 +3871,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: .LBB76_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_10
; SSE-NEXT: # BB#11:
@@ -4190,7 +4190,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movdqa 16(%rdi), %xmm5
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_1
; SSE-NEXT: # BB#2:
@@ -4204,7 +4204,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: addss %xmm4, %xmm4
; SSE-NEXT: .LBB80_3:
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_4
; SSE-NEXT: # BB#5:
@@ -4219,7 +4219,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB80_6:
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_7
; SSE-NEXT: # BB#8:
@@ -4234,7 +4234,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB80_9:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_10
; SSE-NEXT: # BB#11:
@@ -4250,7 +4250,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: addss %xmm5, %xmm5
; SSE-NEXT: .LBB80_12:
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_13
; SSE-NEXT: # BB#14:
@@ -4264,7 +4264,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB80_15:
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_16
; SSE-NEXT: # BB#17:
@@ -4283,7 +4283,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_19
; SSE-NEXT: # BB#20:
@@ -4302,7 +4302,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_22
; SSE-NEXT: # BB#23:
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 560e5c568faf0..7a4326c01bb7d 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -4,7 +4,7 @@
define <2 x i64> @test(i64 %i) nounwind {
; CHECK-LABEL: test:
; CHECK: # BB#0:
-; CHECK-NEXT: movd %rdi, %xmm0
+; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: retq
%tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
%tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index b08f96038ff14..994bc2b3056ed 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -10,7 +10,7 @@ define <2 x i64> @t1(i64 %x) nounwind {
;
; X64-LABEL: t1:
; X64: # BB#0:
-; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
ret <2 x i64> %tmp8
diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll
index 64c64c3925441..c13299b9cb385 100644
--- a/test/CodeGen/X86/vec_shift7.ll
+++ b/test/CodeGen/X86/vec_shift7.ll
@@ -17,7 +17,7 @@ define i64 @test1(<2 x i64> %a) {
;
; X64-LABEL: test1:
; X64: # BB#0: # %entry
-; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
entry:
%c = shl <2 x i64> %a, <i64 0, i64 2>
diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index 316df2780d16e..202b8f7786b80 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -10,7 +10,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
@@ -46,7 +46,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: andpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
@@ -285,7 +285,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
@@ -321,7 +321,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext:
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index 1d3db6495708f..043ba28e8fa40 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -10,7 +10,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
@@ -46,7 +46,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: orpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
@@ -267,7 +267,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
@@ -303,7 +303,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext:
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 895bf5c0f02d1..2b5eb695f53ea 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -11,22 +11,22 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
-; SSE2-NEXT: movd %rdx, %xmm1
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
-; SSE2-NEXT: movd %rdx, %xmm0
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -40,14 +40,14 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
-; SSE41-NEXT: movd %rdx, %xmm1
-; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movq %rdx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
-; SSE41-NEXT: movd %rdx, %xmm0
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -275,7 +275,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
@@ -286,9 +286,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rdx, %rax
; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
@@ -298,7 +298,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rdx, %rax
; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: movq %rcx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -316,8 +316,8 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rdx, %rax
; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movd %rcx, %xmm1
-; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
@@ -327,7 +327,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rdx, %rax
; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: movq %rcx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 1b35e2fdddae0..cd17fcf8c85b4 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -11,7 +11,7 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
@@ -19,16 +19,16 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: addq %rdx, %rcx
; SSE2-NEXT: shrq $2, %rcx
-; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: subq %rdx, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: addq %rdx, %rcx
; SSE2-NEXT: shrq $2, %rcx
-; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: movq %rcx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -43,15 +43,15 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: addq %rdx, %rcx
; SSE41-NEXT: shrq $2, %rcx
-; SSE41-NEXT: movd %rcx, %xmm1
-; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: subq %rdx, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: addq %rdx, %rcx
; SSE41-NEXT: shrq $2, %rcx
-; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: movq %rcx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -255,7 +255,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
@@ -267,9 +267,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: leaq (,%rax,8), %rdx
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rcx, %rax
@@ -280,7 +280,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: leaq (,%rax,8), %rdx
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: movq %rcx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -299,8 +299,8 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: leaq (,%rax,8), %rdx
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movd %rcx, %xmm1
-; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rcx, %rax
@@ -311,7 +311,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: leaq (,%rax,8), %rdx
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: movq %rcx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index 9e11edcc29dc5..f1f795bf3cb03 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1579,7 +1579,7 @@ define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # BB#0:
; SSE-NEXT: movl $55, %eax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
@@ -1607,7 +1607,7 @@ define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # BB#0:
; SSE-NEXT: movl $55, %eax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 0718edf5a1433..f05588a2920c7 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
@@ -19,7 +19,6 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %x) {
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%sign = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%not = xor <16 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
ret <16 x i8> %not
@@ -37,7 +36,6 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %x) {
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%sign = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%not = xor <8 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
ret <8 x i16> %not
@@ -55,7 +53,6 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%sign = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%not = xor <4 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %not
@@ -81,7 +78,6 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%sign = ashr <2 x i64> %x, <i64 63, i64 63>
%not = xor <2 x i64> %sign, <i64 -1, i64 -1>
ret <2 x i64> %not
@@ -91,23 +87,23 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
; SSE2-LABEL: test_strange_type:
; SSE2: # BB#0:
; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movd %rsi, %xmm0
+; SSE2-NEXT: movq %rsi, %xmm0
; SSE2-NEXT: notq %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: movq %rsi, %rdx
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_strange_type:
; SSE42: # BB#0:
; SSE42-NEXT: sarq $63, %rsi
-; SSE42-NEXT: movd %rsi, %xmm0
+; SSE42-NEXT: movq %rsi, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm0, %xmm1
-; SSE42-NEXT: movd %xmm1, %rax
+; SSE42-NEXT: movq %xmm1, %rax
; SSE42-NEXT: pextrq $1, %xmm1, %rdx
; SSE42-NEXT: retq
;
@@ -132,7 +128,6 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: retq
-;
%sign = ashr <1 x i128> %x, <i128 127>
%not = xor <1 x i128> %sign, <i128 -1>
ret <1 x i128> %not
@@ -163,7 +158,6 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%sign = ashr <32 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%not = xor <32 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
ret <32 x i8> %not
@@ -193,7 +187,6 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%sign = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%not = xor <16 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
ret <16 x i16> %not
@@ -223,7 +216,6 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%sign = ashr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%not = xor <8 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
ret <8 x i32> %not
@@ -266,7 +258,6 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%sign = ashr <4 x i64> %x, <i64 63, i64 63, i64 63, i64 63>
%not = xor <4 x i64> %sign, <i64 -1, i64 -1, i64 -1, i64 -1>
ret <4 x i64> %not
@@ -284,7 +275,6 @@ define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp eq <16 x i8> %a, %b
%zext = zext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %zext
@@ -314,7 +304,6 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%cmp = icmp eq <16 x i16> %a, %b
%zext = zext <16 x i1> %cmp to <16 x i16>
ret <16 x i16> %zext
@@ -332,7 +321,6 @@ define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp eq <4 x i32> %a, %b
%zext = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %zext
@@ -375,7 +363,6 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%cmp = icmp eq <4 x i64> %a, %b
%zext = zext <4 x i1> %cmp to <4 x i64>
ret <4 x i64> %zext
@@ -406,7 +393,6 @@ define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%cmp = icmp sgt <32 x i8> %a, %b
%zext = zext <32 x i1> %cmp to <32 x i8>
ret <32 x i8> %zext
@@ -424,7 +410,6 @@ define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp sgt <8 x i16> %a, %b
%zext = zext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %zext
@@ -454,7 +439,6 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT: retq
-;
%cmp = icmp sgt <8 x i32> %a, %b
%zext = zext <8 x i1> %cmp to <8 x i32>
ret <8 x i32> %zext
@@ -488,7 +472,6 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp sgt <2 x i64> %a, %b
%zext = zext <2 x i1> %cmp to <2 x i64>
ret <2 x i64> %zext
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index e9f1d1d8522b3..8cc1d8c765ac3 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -1207,10 +1207,10 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shlq $62, %rcx
; SSE-NEXT: sarq $63, %rcx
-; SSE-NEXT: movd %rcx, %xmm1
+; SSE-NEXT: movq %rcx, %xmm1
; SSE-NEXT: shlq $63, %rax
; SSE-NEXT: sarq $63, %rax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
@@ -1687,28 +1687,28 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movsbq 3(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: movsbq 2(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: movq %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movsbq 3(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movq %rax, %xmm2
; SSSE3-NEXT: movsbq 2(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
@@ -2038,48 +2038,48 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_8i8_to_8i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movsbq 3(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: movsbq 2(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: movsbq 5(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: movq %rax, %xmm3
; SSE2-NEXT: movsbq 4(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: movsbq 7(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm4
+; SSE2-NEXT: movq %rax, %xmm4
; SSE2-NEXT: movsbq 6(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: movq %rax, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i8_to_8i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: movq %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movsbq 3(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movq %rax, %xmm2
; SSSE3-NEXT: movsbq 2(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: movsbq 5(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: movq %rax, %xmm3
; SSSE3-NEXT: movsbq 4(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movq %rax, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT: movsbq 7(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm4
+; SSSE3-NEXT: movq %rax, %xmm4
; SSSE3-NEXT: movsbq 6(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: movq %rax, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSSE3-NEXT: retq
;
@@ -4542,28 +4542,28 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movswq 2(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movswq (%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movswq 6(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: movswq 4(%rdi), %rax
-; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movswq 2(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movswq (%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: movq %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movswq 6(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movq %rax, %xmm2
; SSSE3-NEXT: movswq 4(%rdi), %rax
-; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index d0ead653b203d..e38d3f9744852 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -986,7 +986,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
; SSE-LABEL: insert_reg_and_zero_v2i64:
; SSE: # BB#0:
-; SSE-NEXT: movd %rdi, %xmm0
+; SSE-NEXT: movq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_and_zero_v2i64:
@@ -1048,25 +1048,25 @@ define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
; SSE2-LABEL: insert_reg_lo_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v2i64:
; SSE3: # BB#0:
-; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movq %rdi, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v2i64:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movq %rdi, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: movq %rdi, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
@@ -1140,7 +1140,7 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
; SSE-LABEL: insert_reg_hi_v2i64:
; SSE: # BB#0:
-; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: movq %rdi, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 3e9e980a19730..e9c0d0962ab3e 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2104,25 +2104,25 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
; SSE2-LABEL: insert_reg_lo_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v4i32:
; SSE3: # BB#0:
-; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movq %rdi, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v4i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movq %rdi, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: movq %rdi, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
@@ -2191,7 +2191,7 @@ define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
; SSE-LABEL: insert_reg_hi_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: movq %rdi, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 1385929ab8cd3..202acbcd35007 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -879,3 +879,29 @@ define <32 x i8> @constant_fold_pshufb_256() {
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
ret <32 x i8> %1
}
+
+define <32 x i8> @PR27320(<8 x i32> %a0) {
+; X32-LABEL: PR27320:
+; X32: # BB#0:
+; X32-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm2
+; X32-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
+; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
+; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: PR27320:
+; X64: # BB#0:
+; X64-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm2
+; X64-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
+; X64-NEXT: vpor %xmm1, %xmm2, %xmm1
+; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
+; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
+ %2 = bitcast <8 x i32> %1 to <32 x i8>
+ %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
+ ret <32 x i8> %3
+}
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index ab34ad6a613cc..a5fac9ac6a41e 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -1257,7 +1257,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
@@ -1301,7 +1301,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
+; SSE-NEXT: movq %rax, %xmm4
; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
@@ -1418,7 +1418,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm8, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
@@ -2411,7 +2411,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
@@ -2554,7 +2554,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d39a90b066f5e..58f7407eeec4e 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -906,7 +906,7 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2i64_i64:
@@ -1031,19 +1031,19 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc4i32_i64:
@@ -1158,19 +1158,19 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc8i16_i64:
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 56f634c4188fd..22d0065b264fc 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -1249,7 +1249,7 @@ define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # BB#0:
; SSE-NEXT: movl $8, %eax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
@@ -1271,7 +1271,7 @@ define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # BB#0:
; SSE-NEXT: movl $8, %eax
-; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
diff --git a/test/CodeGen/X86/vmovq.ll b/test/CodeGen/X86/vmovq.ll
index 45d350c743e25..5c1ff7d06ee0b 100644
--- a/test/CodeGen/X86/vmovq.ll
+++ b/test/CodeGen/X86/vmovq.ll
@@ -6,7 +6,7 @@ define <2 x i64> @PR25554(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: PR25554:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; SSE-NEXT: paddq %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index 7ad5706592e42..c9a34de123692 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -39,7 +39,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
;
; X64-LABEL: shift1b:
; X64: # BB#0: # %entry
-; X64-NEXT: movd %rsi, %xmm1
+; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index f79fc5bff9687..88cba8a4d6ac8 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -39,7 +39,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
;
; X64-LABEL: shift1b:
; X64: # BB#0: # %entry
-; X64-NEXT: movd %rsi, %xmm1
+; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: psrlq %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll
index e62698221973f..f844904c86905 100644
--- a/test/CodeGen/X86/vsplit-and.ll
+++ b/test/CodeGen/X86/vsplit-and.ll
@@ -23,13 +23,13 @@ define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind read
define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
; CHECK-LABEL: t2:
; CHECK: # BB#0:
-; CHECK-NEXT: movd %r9, %xmm1
-; CHECK-NEXT: movd %r8, %xmm0
+; CHECK-NEXT: movq %r9, %xmm1
+; CHECK-NEXT: movq %r8, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: movd %rdx, %xmm2
-; CHECK-NEXT: movd %rsi, %xmm1
+; CHECK-NEXT: movq %rdx, %xmm2
+; CHECK-NEXT: movq %rsi, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT: movd %rcx, %xmm2
+; CHECK-NEXT: movq %rcx, %xmm2
; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: pxor %xmm4, %xmm4
; CHECK-NEXT: pcmpeqq %xmm4, %xmm2
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index a973fdaa8d601..986fa4743c6c2 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -16,7 +16,7 @@ define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
;
; X64-LABEL: convert:
; X64: ## BB#0: ## %entry
-; X64-NEXT: movd %rsi, %xmm0
+; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pxor {{.*}}(%rip), %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 504485440efff..3b20f3515716c 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -105,7 +105,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # BB#0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
-; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -129,7 +129,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE42: # BB#0: # %entry
; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: movq %rcx, %xmm0
; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
; X64-SSE42-NEXT: pslld $24, %xmm0
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index ef56692e947ce..6dc938893d384 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -130,7 +130,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # BB#0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
-; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -154,7 +154,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE42: # BB#0: # %entry
; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: movq %rcx, %xmm0
; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0