diff options
Diffstat (limited to 'test/CodeGen')
147 files changed, 10435 insertions, 932 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir new file mode 100644 index 0000000000000..96436209451b0 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir @@ -0,0 +1,65 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + define i32 @main() { + entry: + ret i32 0 + } + + declare i32 @printf(i8*, ...) +... +--- +# CHECK-LABEL: name: main +name: main +alignment: 2 +exposesReturnsTwice: false +noVRegs: false +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } + - { id: 8, class: gpr } + - { id: 9, class: gpr } + - { id: 10, class: gpr } + - { id: 11, class: gpr } + - { id: 12, class: gpr } + - { id: 13, class: gpr } + - { id: 14, class: gpr } + - { id: 15, class: gpr } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: false + hasCalls: true + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +# CHECK: body: +# CHECK: %1 = COPY %w0 +# CHECK-NOT: %2 = ORNWrr %wzr, %1 +# CHECK: %4 = EONWrr %1, %3 +body: | + bb.1.entry: + liveins: %w0 + %0(s32) = G_CONSTANT i32 -1 + %3(s32) = G_CONSTANT i32 1 + %1(s32) = COPY %w0 + %2(s32) = G_XOR %1, %0 + %4(s32) = G_XOR %2, %3 + %w0 = COPY %4(s32) +... 
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll index a5fa78abb92f4..a7668ec97979c 100644 --- a/test/CodeGen/AArch64/arm64-vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -1201,35 +1201,35 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou ; Scalar FMULX define float @fmulxs(float %a, float %b) nounwind { ; CHECK-LABEL: fmulxs: -; CHECKNEXT: fmulx s0, s0, s1 +; CHECK-NEXT: fmulx s0, s0, s1 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd(double %a, double %b) nounwind { ; CHECK-LABEL: fmulxd: -; CHECKNEXT: fmulx d0, d0, d1 +; CHECK-NEXT: fmulx d0, d0, d1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret double %fmulx.i } define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { ; CHECK-LABEL: fmulxs_lane: -; CHECKNEXT: fmulx.s s0, s0, v1[3] +; CHECK-NEXT: fmulx.s s0, s0, v1[3] %b = extractelement <4 x float> %vec, i32 3 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmulxd_lane: -; CHECKNEXT: fmulx d0, d0, v1[1] +; CHECK-NEXT: fmulx.d d0, d0, v1[1] %b = extractelement <2 x double> %vec, i32 1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret double %fmulx.i } diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll new file mode 100644 index 0000000000000..2ed744277385a --- /dev/null +++ b/test/CodeGen/AArch64/fence-singlethread.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=LINUX +; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s 
--check-prefix=IOS +; RUN: llc -mtriple=aarch64-linux-gnueabihf %s -filetype=obj -o %t +; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ + +; OBJ-NOT: dmb + +define void @fence_singlethread() { +; LINUX-LABEL: fence_singlethread: +; LINUX-NOT: dmb +; LINUX: // COMPILER BARRIER +; LINUX-NOT: dmb + +; IOS-LABEL: fence_singlethread: +; IOS-NOT: dmb +; IOS: ; COMPILER BARRIER +; IOS-NOT: dmb + + fence singlethread seq_cst + ret void +} diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll new file mode 100644 index 0000000000000..a4725c65aa26f --- /dev/null +++ b/test/CodeGen/AArch64/optimize-imm.ll @@ -0,0 +1,64 @@ +; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s + +; CHECK-LABEL: and1: +; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd + +define void @and1(i32 %a, i8* nocapture %p) { +entry: + %and = and i32 %a, 253 + %conv = trunc i32 %and to i8 + store i8 %conv, i8* %p, align 1 + ret void +} + +; (a & 0x3dfd) | 0xffffc000 +; +; CHECK-LABEL: and2: +; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd + +define i32 @and2(i32 %a) { +entry: + %and = and i32 %a, 15869 + %or = or i32 %and, -16384 + ret i32 %or +} + +; (a & 0x19) | 0xffffffc0 +; +; CHECK-LABEL: and3: +; CHECK: and {{w[0-9]+}}, w0, #0x99999999 + +define i32 @and3(i32 %a) { +entry: + %and = and i32 %a, 25 + %or = or i32 %and, -64 + ret i32 %or +} + +; (a & 0xc5600) | 0xfff1f1ff +; +; CHECK-LABEL: and4: +; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff + +define i32 @and4(i32 %a) { +entry: + %and = and i32 %a, 787968 + %or = or i32 %and, -921089 + ret i32 %or +} + +; Make sure we don't shrink or optimize an XOR's immediate operand if the +; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into +; a BIC. 
+ +; CHECK-LABEL: xor1: +; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38 +; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3 + +define i32 @xor1(i32 %a) { +entry: + %shl = shl i32 %a, 3 + %xor = and i32 %shl, 56 + %and = xor i32 %xor, 56 + ret i32 %and +} diff --git a/test/CodeGen/AArch64/swiftself-scavenger.ll b/test/CodeGen/AArch64/swiftself-scavenger.ll new file mode 100644 index 0000000000000..6d02784409317 --- /dev/null +++ b/test/CodeGen/AArch64/swiftself-scavenger.ll @@ -0,0 +1,82 @@ +; RUN: llc -o - %s | FileCheck %s +; Check that we reserve an emergency spill slot, even if we added an extra +; CSR spill for the values used by the swiftself parameter. +; CHECK-LABEL: func: +; CHECK: str [[REG:x[0-9]+]], [sp, #8] +; CHECK: add [[REG]], sp, #248 +; CHECK: str xzr, [{{\s*}}[[REG]], #32760] +; CHECK: ldr x30, [sp, #8] +target triple = "arm64-apple-ios" + +@ptr8 = external global i8* +@ptr64 = external global i64 + +define hidden swiftcc void @func(i8* swiftself %arg) #0 { +bb: + %stack0 = alloca i8*, i32 5000, align 8 + %stack1 = alloca i8*, i32 32, align 8 + + %v0 = load volatile i64, i64* @ptr64, align 8 + %v1 = load volatile i64, i64* @ptr64, align 8 + %v2 = load volatile i64, i64* @ptr64, align 8 + %v3 = load volatile i64, i64* @ptr64, align 8 + %v4 = load volatile i64, i64* @ptr64, align 8 + %v5 = load volatile i64, i64* @ptr64, align 8 + %v6 = load volatile i64, i64* @ptr64, align 8 + %v7 = load volatile i64, i64* @ptr64, align 8 + %v8 = load volatile i64, i64* @ptr64, align 8 + %v9 = load volatile i64, i64* @ptr64, align 8 + %v10 = load volatile i64, i64* @ptr64, align 8 + %v11 = load volatile i64, i64* @ptr64, align 8 + %v12 = load volatile i64, i64* @ptr64, align 8 + %v13 = load volatile i64, i64* @ptr64, align 8 + %v14 = load volatile i64, i64* @ptr64, align 8 + %v15 = load volatile i64, i64* @ptr64, align 8 + %v16 = load volatile i64, i64* @ptr64, align 8 + %v17 = load volatile i64, i64* @ptr64, align 8 + %v18 = load volatile i64, i64* @ptr64, align 8 + %v19 = 
load volatile i64, i64* @ptr64, align 8 + %v20 = load volatile i64, i64* @ptr64, align 8 + %v21 = load volatile i64, i64* @ptr64, align 8 + %v22 = load volatile i64, i64* @ptr64, align 8 + %v23 = load volatile i64, i64* @ptr64, align 8 + %v24 = load volatile i64, i64* @ptr64, align 8 + %v25 = load volatile i64, i64* @ptr64, align 8 + + ; this should exceed stack-relative addressing limits and need an emergency + ; spill slot. + %s = getelementptr inbounds i8*, i8** %stack0, i64 4092 + store volatile i8* null, i8** %s + store volatile i8* null, i8** %stack1 + + store volatile i64 %v0, i64* @ptr64, align 8 + store volatile i64 %v1, i64* @ptr64, align 8 + store volatile i64 %v2, i64* @ptr64, align 8 + store volatile i64 %v3, i64* @ptr64, align 8 + store volatile i64 %v4, i64* @ptr64, align 8 + store volatile i64 %v5, i64* @ptr64, align 8 + store volatile i64 %v6, i64* @ptr64, align 8 + store volatile i64 %v7, i64* @ptr64, align 8 + store volatile i64 %v8, i64* @ptr64, align 8 + store volatile i64 %v9, i64* @ptr64, align 8 + store volatile i64 %v10, i64* @ptr64, align 8 + store volatile i64 %v11, i64* @ptr64, align 8 + store volatile i64 %v12, i64* @ptr64, align 8 + store volatile i64 %v13, i64* @ptr64, align 8 + store volatile i64 %v14, i64* @ptr64, align 8 + store volatile i64 %v15, i64* @ptr64, align 8 + store volatile i64 %v16, i64* @ptr64, align 8 + store volatile i64 %v17, i64* @ptr64, align 8 + store volatile i64 %v18, i64* @ptr64, align 8 + store volatile i64 %v19, i64* @ptr64, align 8 + store volatile i64 %v20, i64* @ptr64, align 8 + store volatile i64 %v21, i64* @ptr64, align 8 + store volatile i64 %v22, i64* @ptr64, align 8 + store volatile i64 %v23, i64* @ptr64, align 8 + store volatile i64 %v24, i64* @ptr64, align 8 + store volatile i64 %v25, i64* @ptr64, align 8 + + ; use swiftself parameter late so it stays alive throughout the function. 
+ store volatile i8* %arg, i8** @ptr8 + ret void +} diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index e137ef4bc2367..73e80d523f1e2 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; FIXME: Need to handle non-uniform case for function below (load without gep). diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index 6ec93c72ec527..b1e71722d80c5 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -223,9 +223,8 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { } ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast: -; HSA-DAG: v_mov_b32_e32 
[[PTR:v[0-9]+]], 0{{$}} -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} -; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} +; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* store volatile i32 7, i32* %cast diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll index 96a5e3b23758a..7f424ef2a1477 100644 --- a/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/code-object-metadata-images.ll b/test/CodeGen/AMDGPU/code-object-metadata-images.ll new file mode 100644 index 0000000000000..918560469852b --- /dev/null +++ b/test/CodeGen/AMDGPU/code-object-metadata-images.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s + +%opencl.image1d_t = type opaque +%opencl.image1d_array_t = type opaque +%opencl.image1d_buffer_t = type opaque +%opencl.image2d_t = type opaque +%opencl.image2d_array_t = type opaque +%opencl.image2d_array_depth_t = type opaque +%opencl.image2d_array_msaa_t = type opaque +%opencl.image2d_array_msaa_depth_t = type opaque +%opencl.image2d_depth_t = type opaque +%opencl.image2d_msaa_t = type opaque +%opencl.image2d_msaa_depth_t = type opaque +%opencl.image3d_t = type opaque + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] + +; CHECK: Kernels: +; CHECK: - Name: test +; CHECK: Args: +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_array_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_buffer_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_msaa_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_msaa_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_msaa_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_msaa_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image3d_t +define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a, + %opencl.image1d_array_t addrspace(1)* %b, + %opencl.image1d_buffer_t addrspace(1)* %c, + %opencl.image2d_t 
addrspace(1)* %d, + %opencl.image2d_array_t addrspace(1)* %e, + %opencl.image2d_array_depth_t addrspace(1)* %f, + %opencl.image2d_array_msaa_t addrspace(1)* %g, + %opencl.image2d_array_msaa_depth_t addrspace(1)* %h, + %opencl.image2d_depth_t addrspace(1)* %i, + %opencl.image2d_msaa_t addrspace(1)* %j, + %opencl.image2d_msaa_depth_t addrspace(1)* %k, + %opencl.image3d_t addrspace(1)* %l) + !kernel_arg_type !1 !kernel_arg_base_type !1 { + ret void +} + +!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t", + !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t", + !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t", + !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t", + !"image3d_t"} diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index f2686a5582dc6..c9787bb478ef2 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index b3a2b66437207..738a5adba14fb 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -85,10 +85,20 @@ entry: } ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: 
v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; GCN-NOT: s_setreg +; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; GCN-NOT: s_setreg +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b diff --git a/test/CodeGen/AMDGPU/fence-amdgiz.ll b/test/CodeGen/AMDGPU/fence-amdgiz.ll new file mode 100644 index 0000000000000..df675c9a8692e --- /dev/null +++ b/test/CodeGen/AMDGPU/fence-amdgiz.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn-amd-amdhsa-amdgizcl" + +; CHECK-LABEL: atomic_fence +; CHECK: BB#0: +; CHECK: ATOMIC_FENCE 4, 1 +; CHECK: s_endpgm + +define amdgpu_kernel void @atomic_fence() { + fence acquire + ret void +} + diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll index bdd3c04fd3189..624610096cbc5 100644 --- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 
%s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 555764c15519e..506b2a02f8281 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 diff --git a/test/CodeGen/AMDGPU/immv216.ll 
b/test/CodeGen/AMDGPU/immv216.ll index 85ad365d02a89..c15a30e3c5401 100644 --- a/test/CodeGen/AMDGPU/immv216.ll +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; FIXME: Merge into imm.ll diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index a3f82b8a01174..89adcff1a2787 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -216,7 +216,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]] -; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000{{$}} ; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]] ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 5e892fad3741b..cbd8f0a9c23a3 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s ret void } +; CHECK-LABEL: {{^}}test_readlane_vregs: +; CHECK: v_readfirstlane_b32 
[[LANE:s[0-9]+]], v{{[0-9]+}} +; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]] +define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid + %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in + %value = extractelement <2 x i32> %args, i32 0 + %lane = extractelement <2 x i32> %args, i32 1 + %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane) + store i32 %readlane, i32 addrspace(1)* %out, align 4 + ret void +} + ; TODO: m0 should be folded. ; CHECK-LABEL: {{^}}test_readlane_m0_sreg: ; CHECK: s_mov_b32 m0, -1 @@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) ret void } +declare i32 @llvm.amdgcn.workitem.id.x() #2 + attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } +attributes #2 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll new file mode 100644 index 0000000000000..bafafa33016fa --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll @@ -0,0 +1,9 @@ +; RUN: llc -march amdgcn %s -filetype=obj -o /dev/null +; RUN: llc -march amdgcn <%s | FileCheck %s +define amdgpu_kernel void @f() { + ; CHECK: ; divergent unreachable + call void @llvm.amdgcn.unreachable() + ret void +} + +declare void @llvm.amdgcn.unreachable() diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll index 84c42e8bd1e06..b9df2cb779ad0 100644 --- a/test/CodeGen/AMDGPU/loop_break.ll +++ b/test/CodeGen/AMDGPU/loop_break.ll @@ -10,7 +10,7 @@ ; OPT: bb4: ; OPT: load volatile -; OPT: %cmp1 = icmp sge i32 %tmp, %load +; OPT: xor i1 %cmp1 ; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: br label %Flow diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll index e21d0d09bb415..6a90a7a9f2eb3 
100644 --- a/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir new file mode 100644 index 0000000000000..064db49924e15 --- /dev/null +++ b/test/CodeGen/AMDGPU/merge-m0.mir @@ -0,0 +1,132 @@ +# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s + +# GCN: bb.0.entry: +# GCN: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.1: +# GCN: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.2: +# GCN: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.3: +# GCN: SI_INIT_M0 3 + +# GCN: bb.4: +# GCN-NOT: SI_INIT_M0 +# GCN: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 4 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.5: +# GCN-NOT: SI_INIT_M0 +# GCN: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 4 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.6: +# GCN: SI_INIT_M0 -1, +# GCN-NEXT: DS_WRITE_B32 +# GCN: SI_INIT_M0 %2 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 %2 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 + +--- +name: test +alignment: 0 +exposesReturnsTwice: false +noVRegs: false 
+legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: sreg_32_xm0 } +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_CBRANCH_VCCZ %bb.1, implicit undef %vcc + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4, %bb.5 + S_CBRANCH_VCCZ %bb.4, implicit undef %vcc + S_BRANCH %bb.5 + + bb.4: + successors: %bb.6 + SI_INIT_M0 3, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 4, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.6 + + bb.5: + successors: %bb.6 + SI_INIT_M0 3, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 4, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.6 + + bb.6: + successors: %bb.0.entry, %bb.6 + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + %2 = IMPLICIT_DEF + 
SI_INIT_M0 %2, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 %2, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_CBRANCH_VCCZ %bb.6, implicit undef %vcc + S_BRANCH %bb.0.entry + +... diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll new file mode 100644 index 0000000000000..3a0605fa182a3 --- /dev/null +++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -0,0 +1,136 @@ +; RUN: llc -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; Test addressing modes when the scratch base is not a frame index. 
+ +; GCN-LABEL: {{^}}store_private_offset_i8: +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i8() #0 { + store volatile i8 5, i8* inttoptr (i32 8 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i16: +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i16() #0 { + store volatile i16 5, i16* inttoptr (i32 8 to i16*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i32: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i32() #0 { + store volatile i32 5, i32* inttoptr (i32 8 to i32*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_v2i32: +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_v2i32() #0 { + store volatile <2 x i32> <i32 5, i32 10>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_v4i32: +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_v4i32() #0 { + store volatile <4 x i32> <i32 5, i32 10, i32 15, i32 0>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i8: +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i8() #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + ret void +} + +; GCN-LABEL: {{^}}sextload_private_offset_i8: +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %sextload = sext i8 %load to i32 + store i32 %sextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}zextload_private_offset_i8: +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 
offset:8 +define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %zextload = zext i8 %load to i32 + store i32 %zextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i16: +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i16() #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + ret void +} + +; GCN-LABEL: {{^}}sextload_private_offset_i16: +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %sextload = sext i16 %load to i32 + store i32 %sextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}zextload_private_offset_i16: +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %zextload = zext i16 %load to i32 + store i32 %zextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i32: +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i32() #0 { + %load = load volatile i32, i32* inttoptr (i32 8 to i32*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_v2i32: +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_v2i32() #0 { + %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_v4i32: +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_v4i32() #0 { + %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + ret void +} + 
+; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095 +define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { + store volatile i8 5, i8* inttoptr (i32 4095 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}} +define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { + store volatile i8 5, i8* inttoptr (i32 4096 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}} +define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { + store volatile i8 5, i8* inttoptr (i32 4097 to i8*) + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 4bd8bff4809af..9d0b6b395996b 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -9,19 +9,18 @@ ; StructurizeCFG. 
; IR-LABEL: @multi_divergent_region_exit_ret_ret( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: %1 = extractvalue { i1, i64 } %0, 0 -; IR: %2 = extractvalue { i1, i64 } %0, 1 -; IR: br i1 %1, label %LeafBlock1, label %Flow +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %6 = extractvalue { i1, i64 } %5, 0 -; IR: %7 = extractvalue { i1, i64 } %5, 1 -; IR: br i1 %6, label %LeafBlock, label %Flow1 +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 ; IR: LeafBlock: ; IR: br label %Flow1 @@ -30,32 +29,32 @@ ; IR: br label %Flow{{$}} ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: %10 = extractvalue { i1, i64 } [[IF]], 0 -; IR: %11 = extractvalue { i1, i64 } [[IF]], 1 -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow 
] -; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %11) +; IR: call void @llvm.amdgcn.end.cf(i64 %14) ; IR: ret void @@ -65,9 +64,11 @@ ; GCN: s_xor_b64 -; GCN: ; %LeafBlock -; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; FIXME: Why is this compare essentially repeated? 
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec @@ -125,15 +126,14 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock ; IR: UnifiedUnreachableBlock: @@ -181,49 +181,51 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( -; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2 +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 ; IR: llvm.amdgcn.if ; IR: br i1 ; IR: {{^}}Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: br i1 %6, label %LeafBlock, label %Flow1 +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 ; IR: {{^}}LeafBlock: -; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1 +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true 
; IR: br label %Flow1 ; IR: LeafBlock1: -; IR: %uniform.cond0 = icmp ne i32 %arg3, 2 +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true ; IR: br label %Flow ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: {{^}}Flow1: -; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ] -; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %11) +; IR: call void @llvm.amdgcn.end.cf(i64 %14) ; IR: ret void define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -262,18 +264,17 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: 
@multi_exit_region_uniform_ret_divergent_ret( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: br i1 %1, label %LeafBlock1, label %Flow +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -313,13 +314,13 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( ; IR: Flow2: -; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] -; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %17) +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) ; IR: UnifiedReturnBlock: -; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %12) +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) ; IR: ret float %UnifiedRetVal define amdgpu_ps float 
@multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { entry: @@ -386,32 +387,31 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef ; IR-NEXT: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] -; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: 
exit1: ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef @@ -419,7 +419,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) ; IR-NEXT: ret void define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -475,7 +475,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) ; IR-NEXT: ret void define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -622,15 +622,15 @@ uniform.ret: ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( ; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region -; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] -; IR: br i1 %6, label %uniform.if, label %Flow2 +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 ; IR: Flow: ; preds = %uniform.then, %uniform.if -; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ] -; IR: br i1 %7, label %uniform.endif, label %uniform.ret0 +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 ; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) ; IR-NEXT: ret void define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { entry: diff --git 
a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll index c0b4eaff60aac..672549c8ea636 100644 --- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -133,9 +133,9 @@ bb23: ; preds = %bb10 ; IR: Flow1: ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ] -; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ] +; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ] ; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ] ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi) ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) @@ -144,9 +144,9 @@ bb23: ; preds = %bb10 ; IR: Flow2: ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ] -; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ] ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23) @@ -156,15 +156,16 @@ bb23: ; preds = %bb10 ; IR: bb21: ; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken) +; IR-NEXT: %27 = xor i1 %tmp12, true +; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken) ; IR-NEXT: br label %Flow3 ; IR: Flow3: ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, 
%Flow2 ] -; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ] -; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] -; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] -; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ] +; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ] +; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] +; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] +; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26) ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4 diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index af26835102938..dcb089010e99d 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s +; RUN: llc -O0 
-mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=OPTNONE %s ; There are no stack objects, but still a private memory access. The ; private access regiters need to be correctly initialized anyway, and @@ -27,9 +27,9 @@ define amdgpu_kernel void @store_to_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { - store volatile i32 0, i32* inttoptr (i32 123 to i32*) + store volatile i32 0, i32* inttoptr (i32 124 to i32*) ret void } @@ -47,9 +47,9 @@ define amdgpu_kernel void @load_from_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { - %ld = load volatile i32, i32* inttoptr (i32 123 to i32*) + %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) ret void } diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll index 5c698c839fa68..d7b353cd25d38 100644 --- a/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -22,4 +22,18 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { ret void } +; This test used to crash in ScheduleDAG. 
+; +; GCN-LABEL: {{^}}test_readcyclecounter_smem: +; SI-DAG: s_memtime +; VI-DAG: s_memrealtime +; GCN-DAG: s_load_dword +define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(2)* inreg %in) #0 { + %cycle0 = call i64 @llvm.readcyclecounter() + %in.v = load i64, i64 addrspace(2)* %in + %r.64 = add i64 %cycle0, %in.v + %r.32 = trunc i64 %r.64 to i32 + ret i32 %r.32 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index 748f98a12c591..f2fbacbab82e7 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -56,7 +56,7 @@ ret.bb: ; preds = %else, %main_body } ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: -; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] ; GCN: ; BB#{{[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index b702e1c07200d..160fb6a038fed 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar 
cases when legal. diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll index eac29bad7cf23..115221c5316dc 100644 --- a/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 4e093cdece212..16ce86bf8b115 100644 --- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 0e715c453209e..8f1aebfe9ceb8 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -69,19 +69,20 @@ endif: ; TOSMEM-NOT: s_m0 ; TOSMEM: s_add_u32 m0, s7, 0x100 ; TOSMEM-NEXT: 
s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it +; FIXME-TOSMEM-NOT: m0 -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200 +; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload @@ -130,7 +131,7 @@ endif: ; preds = %else, %if ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload ; GCN-NOT: v_readlane_b32 m0 @@ -159,13 +160,14 @@ endif: ; GCN-LABEL: {{^}}restore_m0_lds: ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_cmp_eq_u32 -; TOSMEM-NOT: m0 +; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 ; TOSMEM: s_mov_b32 m0, -1 @@ -178,10 +180,10 @@ endif: ; TOSMEM: ds_write_b64 -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM-NOT: m0 ; 
TOSMEM: s_mov_b32 m0, s0 diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index 69f0accef6282..431344670ffb1 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s ; FIXME: Need to handle non-uniform case for function below (load without gep). diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 77ad895d0e86a..51771c9723e00 100644 --- a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -80,4 +80,25 @@ define amdgpu_kernel void @trap() { ret void } +; GCN-LABEL: {{^}}non_entry_trap: +; TRAP-BIT: enable_trap_handler = 1 +; NO-TRAP-BIT: enable_trap_handler = 0 + +; HSA: BB{{[0-9]_[0-9]+}}: ; %trap +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-NEXT: s_trap 2 +define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr #1 { +entry: + %tmp29 = load volatile i32, i32 addrspace(1)* %arg0 + %cmp = icmp eq i32 %tmp29, -1 + br i1 %cmp, label %ret, label %trap + +trap: + call void @llvm.trap() + unreachable + +ret: + ret void +} + attributes #0 = { nounwind noreturn } diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 21c774133f896..83ab2659ef4aa 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -5,6 +5,8 @@ define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void 
@test_trunc_s32_16() { ret void } + define void @test_add_s8() { ret void } define void @test_add_s16() { ret void } define void @test_add_s32() { ret void } @@ -21,6 +23,9 @@ define void @test_mul_s32() #1 { ret void } define void @test_mulv5_s32() { ret void } + define void @test_sdiv_s32() #2 { ret void } + define void @test_udiv_s32() #2 { ret void } + define void @test_load_from_stack() { ret void } define void @test_load_f32() #0 { ret void } define void @test_load_f64() #0 { ret void } @@ -28,12 +33,14 @@ define void @test_stores() #0 { ret void } define void @test_gep() { ret void } - define void @test_constants() { ret void } + define void @test_constant_imm() { ret void } + define void @test_constant_cimm() { ret void } define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } attributes #1 = { "target-features"="+v6" } + attributes #2 = { "target-features"="+hwdiv-arm" } ... --- name: test_zext_s1 @@ -142,6 +149,34 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_trunc_s32_16 +# CHECK-LABEL: name: test_trunc_s32_16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s16) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s16) + ; CHECK: %r0 = COPY [[VREGTRUNC]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- name: test_add_s8 # CHECK-LABEL: name: test_add_s8 legalized: true @@ -538,6 +573,72 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... 
--- +name: test_sdiv_s32 +# CHECK-LABEL: name: test_sdiv_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gpr +# CHECK: id: 1, class: gpr +# CHECK: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_SDIV %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = SDIV [[VREGX]], [[VREGY]], 14, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_udiv_s32 +# CHECK-LABEL: name: test_udiv_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gpr +# CHECK: id: 1, class: gpr +# CHECK: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_UDIV %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = UDIV [[VREGX]], [[VREGY]], 14, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- name: test_load_from_stack # CHECK-LABEL: name: test_load_from_stack legalized: true @@ -714,8 +815,8 @@ body: | BX_RET 14, _, implicit %r0 ... --- -name: test_constants -# CHECK-LABEL: name: test_constants +name: test_constant_imm +# CHECK-LABEL: name: test_constant_imm legalized: true regBankSelected: true selected: false @@ -732,6 +833,26 @@ body: | BX_RET 14, _, implicit %r0 ... 
--- +name: test_constant_cimm +# CHECK-LABEL: name: test_constant_cimm +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +# CHECK: id: [[C:[0-9]+]], class: gpr +body: | + bb.0: + ; Adding a type on G_CONSTANT changes its operand from an Imm into a CImm. + ; We still want to see the same thing in the output though. + %0(s32) = G_CONSTANT i32 42 + ; CHECK: %[[C]] = MOVi 42, 14, _, _ + + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 +... +--- name: test_soft_fp_double # CHECK-LABEL: name: test_soft_fp_double legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll new file mode 100644 index 0000000000000..2881740b016fd --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll @@ -0,0 +1,68 @@ +; We use V6 ops so we can easily check for the extensions (sxth vs bit tricks). +; RUN: llc -mtriple arm-gnueabi -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +; RUN: llc -mtriple arm-gnueabi -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-AEABI +; RUN: llc -mtriple arm-gnu -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +; RUN: llc -mtriple arm-gnu -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-DEFAULT + +define arm_aapcscc i32 @test_sdiv_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_sdiv_i32: +; HWDIV: sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i32 %a, %b + ret i32 %r +} + +define arm_aapcscc i32 @test_udiv_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_udiv_i32: +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i32 %a, %b + ret i32 %r +} + +define arm_aapcscc i16 @test_sdiv_i16(i16 %a, i16 %b) { +; CHECK-LABEL: test_sdiv_i16: +; CHECK-DAG: sxth r0, r0 +; CHECK-DAG: sxth r1, r1 +; HWDIV: 
sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i16 %a, %b + ret i16 %r +} + +define arm_aapcscc i16 @test_udiv_i16(i16 %a, i16 %b) { +; CHECK-LABEL: test_udiv_i16: +; CHECK-DAG: uxth r0, r0 +; CHECK-DAG: uxth r1, r1 +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i16 %a, %b + ret i16 %r +} + +define arm_aapcscc i8 @test_sdiv_i8(i8 %a, i8 %b) { +; CHECK-LABEL: test_sdiv_i8: +; CHECK-DAG: sxtb r0, r0 +; CHECK-DAG: sxtb r1, r1 +; HWDIV: sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i8 %a, %b + ret i8 %r +} + +define arm_aapcscc i8 @test_udiv_i8(i8 %a, i8 %b) { +; CHECK-LABEL: test_udiv_i8: +; CHECK-DAG: uxtb r0, r0 +; CHECK-DAG: uxtb r1, r1 +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i8 %a, %b + ret i8 %r +} + diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index f3ca2915f306e..da02bfe68519d 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -7,6 +7,14 @@ entry: ret void } +define i32 @test_constant_return_i32() { +; CHECK-LABEL: test_constant_return_i32: +; CHECK: mov r0, #42 +; CHECK: bx lr +entry: + ret i32 42 +} + define zeroext i1 @test_zext_i1(i1 %x) { ; CHECK-LABEL: test_zext_i1 ; CHECK: and r0, r0, #1 @@ -40,6 +48,30 @@ entry: ret i16 %x } +define void @test_trunc_i32_i16(i32 %v, i16 *%p) { +; CHECK-LABEL: test_trunc_i32_i16: +; The trunc doesn't result in any instructions, but we +; expect the store to be explicitly 16-bit. +; CHECK: strh r0, [r1] +; CHECK: bx lr +entry: + %v16 = trunc i32 %v to i16 + store i16 %v16, i16 *%p + ret void +} + +define void @test_trunc_i32_i8(i32 %v, i8 *%p) { +; CHECK-LABEL: test_trunc_i32_i8: +; The trunc doesn't result in any instructions, but we +; expect the store to be explicitly 8-bit. 
+; CHECK: strb r0, [r1] +; CHECK: bx lr +entry: + %v8 = trunc i32 %v to i8 + store i8 %v8, i8 *%p + ret void +} + define i8 @test_add_i8(i8 %x, i8 %y) { ; CHECK-LABEL: test_add_i8: ; CHECK: add r0, r0, r1 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir new file mode 100644 index 0000000000000..6f3e09d328cfe --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -0,0 +1,230 @@ +# RUN: llc -mtriple arm-linux-gnueabi -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +# RUN: llc -mtriple arm-linux-gnueabi -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-AEABI +# RUN: llc -mtriple arm-linux-gnu -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +# RUN: llc -mtriple arm-linux-gnu -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-DEFAULT +--- | + define void @test_sdiv_i32() { ret void } + define void @test_udiv_i32() { ret void } + + define void @test_sdiv_i16() { ret void } + define void @test_udiv_i16() { ret void } + + define void @test_sdiv_i8() { ret void } + define void @test_udiv_i8() { ret void } +... 
+--- +name: test_sdiv_i32 +# CHECK-LABEL: name: test_sdiv_i32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + %2(s32) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_udiv_i32 +# CHECK-LABEL: name: test_udiv_i32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + %2(s32) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_sdiv_i16 +# CHECK-LABEL: name: test_sdiv_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + %2(s16) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_udiv_i16 +# CHECK-LABEL: name: test_udiv_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + %2(s16) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_sdiv_i8 +# CHECK-LABEL: name: test_sdiv_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + %2(s8) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_udiv_i8 +# CHECK-LABEL: name: test_udiv_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + %2(s8) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... 
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index e7935832f98a8..4e94fb4e34819 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -13,6 +13,9 @@ define void @test_mul_s16() { ret void } define void @test_mul_s8() { ret void } + define void @test_sdiv_s32() #1 { ret void } + define void @test_udiv_s32() #1 { ret void } + define void @test_loads() #0 { ret void } define void @test_stores() #0 { ret void } @@ -22,12 +25,15 @@ define void @test_constants() { ret void } + define void @test_trunc_s32_16() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } define void @test_soft_fp_s64() #0 { ret void } attributes #0 = { "target-features"="+vfp2"} + attributes #1 = { "target-features"="+hwdiv-arm" } ... --- name: test_add_s32 @@ -290,6 +296,58 @@ body: | ... --- +name: test_sdiv_s32 +# CHECK-LABEL: name: test_sdiv_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_SDIV %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_udiv_s32 +# CHECK-LABEL: name: test_udiv_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_UDIV %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... 
+--- name: test_loads # CHECK-LABEL: name: test_loads legalized: true @@ -442,6 +500,27 @@ body: | BX_RET 14, _, implicit %r0 ... --- +name: test_trunc_s32_16 +# CHECK-LABEL: name: test_trunc_s32_16 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + %1(s16) = G_TRUNC %0(s32) + %r0 = COPY %1(s16) + BX_RET 14, _, implicit %r0 +... +--- name: test_fadd_s32 # CHECK-LABEL: name: test_fadd_s32 legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll new file mode 100644 index 0000000000000..e3680ed2b9298 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple arm-unknown -verify-machineinstrs -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2>&1 | FileCheck %s + +; This file checks that we use the fallback path for things that are known to +; be unsupported on the ARM target. It should progressively shrink in size. 
+ +define <4 x i32> @test_int_vectors(<4 x i32> %a, <4 x i32> %b) { +; CHECK: remark: {{.*}} unable to lower arguments: <4 x i32> (<4 x i32>, <4 x i32>)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_int_vectors + %res = add <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <4 x float> @test_float_vectors(<4 x float> %a, <4 x float> %b) { +; CHECK: remark: {{.*}} unable to lower arguments: <4 x float> (<4 x float>, <4 x float>)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_float_vectors + %res = fadd <4 x float> %a, %b + ret <4 x float> %res +} + +define i64 @test_i64(i64 %a, i64 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i64 (i64, i64)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_i64 + %res = add i64 %a, %b + ret i64 %res +} + +define i128 @test_i128(i128 %a, i128 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i128 (i128, i128)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_i128 + %res = add i128 %a, %b + ret i128 %res +} + +define i17 @test_funny_ints(i17 %a, i17 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i17 (i17, i17)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_funny_ints + %res = add i17 %a, %b + ret i17 %res +} + +define half @test_half(half %a, half %b) { +; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_half + %res = fadd half %a, %b + ret half %res +} + +; On ARM, clang lowers structs to arrays. 
+define void @test_arrays([2 x i32] %this.could.come.from.a.struct) { +; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays + ret void +} + +define void @test_structs({i32, i32} %struct) { +; CHECK: remark: {{.*}} unable to lower arguments: void ({ i32, i32 })* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_structs + ret void +} + +define void @test_vararg_definition(i32 %a, ...) { +; CHECK: remark: {{.*}} unable to lower arguments: void (i32, ...)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_definition + ret void +} + +define void @test_vararg_call(i32 %a) { +; CHECK: remark: {{.*}} unable to translate instruction: call +; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_call + call void(i32, ...) @test_vararg_definition(i32 %a, i32 %a, i32 %a) + ret void +} + +define i32 @test_thumb(i32 %a) #0 { +; CHECK: remark: {{.*}} unable to lower arguments: i32 (i32)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_thumb + ret i32 %a +} + +attributes #0 = { "target-features"="+thumb-mode" } diff --git a/test/CodeGen/ARM/bool-ext-inc.ll b/test/CodeGen/ARM/bool-ext-inc.ll index fe43f1b2ef93d..b91b9b258991f 100644 --- a/test/CodeGen/ARM/bool-ext-inc.ll +++ b/test/CodeGen/ARM/bool-ext-inc.ll @@ -30,3 +30,42 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) { ret <4 x i32> %add } +define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: cmpgt_sext_inc_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vmov.i32 q10, #0x1 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vcgt.s32 q8, q9, q8 +; CHECK-NEXT: vadd.i32 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %cmp = icmp sgt <4 x i32> %x, %y + %ext = 
sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +} + +define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: cmpne_sext_inc_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vceq.i32 q8, q9, q8 +; CHECK-NEXT: vmov.i32 q9, #0x1 +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vadd.i32 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %cmp = icmp ne <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +} + diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll new file mode 100644 index 0000000000000..ec032ccac423c --- /dev/null +++ b/test/CodeGen/ARM/fence-singlethread.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -filetype=obj -o %t +; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ + +; OBJ-NOT: dmb + +define void @fence_singlethread() { +; CHECK-LABEL: fence_singlethread: +; CHECK-NOT: dmb +; CHECK: @ COMPILER BARRIER +; CHECK-NOT: dmb + + fence singlethread seq_cst + ret void +} diff --git a/test/CodeGen/ARM/v6m-smul-with-overflow.ll b/test/CodeGen/ARM/v6m-smul-with-overflow.ll new file mode 100644 index 0000000000000..6e8a7041de2b9 --- /dev/null +++ b/test/CodeGen/ARM/v6m-smul-with-overflow.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s + +define i1 @signed_multiplication_did_overflow(i32, i32) { +; CHECK-LABEL: signed_multiplication_did_overflow: +entry-block: + %2 = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %0, i32 %1) + %3 = extractvalue { i32, i1 } %2, 1 + ret i1 %3 + +; CHECK: mov 
r2, r1 +; CHECK: asrs r1, r0, #31 +; CHECK: asrs r3, r2, #31 +; CHECK: bl __aeabi_lmul +} + +declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll index 1aa23597cf499..3409d37a31f4c 100644 --- a/test/CodeGen/ARM/vpadd.ll +++ b/test/CodeGen/ARM/vpadd.ll @@ -485,6 +485,26 @@ define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) { ret <2 x i16> %x } +; And <2 x i8> to <2 x i32> +define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) { +; CHECK-LABEL: fromExtendingExtractVectorElt_2i8: +; CHECK: vadd.i32 + %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2> + %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3> + %x = add <2 x i8> %tmp2, %tmp1 + ret <2 x i8> %x +} + +define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) { +; CHECK-LABEL: fromExtendingExtractVectorElt_2i16: +; CHECK: vadd.i32 + %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2> + %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3> + %x = add <2 x i16> %tmp2, %tmp1 + ret <2 x i16> %x +} + + declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir new file mode 100644 index 0000000000000..b19e44e29fb66 --- /dev/null +++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir @@ -0,0 +1,35 @@ +# RUN: llc -O0 %s -o - -march=avr | FileCheck %s + +# This test checks the expansion of the 16-bit 'LDDWRdPtrQ' pseudo instruction. 
+# +# This test ensures that the pseudo expander can correctly handle the case +# where we are expanding a 16-bit LDD instruction where the source and +# destination registers are the same. +# +# The instruction itself is earlyclobber and so ISel will never produce an +# instruction like this, but the stack slot loading can and will. + +--- | + target triple = "avr--" + define void @test_lddwrdptrq() { + entry: + ret void + } +... + +--- +name: test_lddwrdptrq +tracksRegLiveness: true +body: | + bb.0.entry: + + ; CHECK-LABEL: test_lddwrdptrq + + ; CHECK: ldd [[SCRATCH:r[0-9]+]], Z+10 + ; CHECK-NEXT: push [[SCRATCH]] + ; CHECK-NEXT: ldd [[SCRATCH]], Z+11 + ; CHECK-NEXT: mov r31, [[SCRATCH]] + ; CHECK-NEXT: pop r30 + + early-clobber %r31r30 = LDDWRdPtrQ undef %r31r30, 10 +... diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir new file mode 100644 index 0000000000000..3e7fdcd400d21 --- /dev/null +++ b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir @@ -0,0 +1,29 @@ +# RUN: llc -O0 %s -o - | FileCheck %s + +# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction. + +--- | + target triple = "avr--" + define void @test_ldwrdptr() { + entry: + ret void + } +... + +--- +name: test_ldwrdptr +tracksRegLiveness: true +body: | + bb.0.entry: + + ; CHECK-LABEL: test_ldwrdptr + + ; CHECK: ld [[SCRATCH:r[0-9]+]], Z + ; CHECK-NEXT: push [[SCRATCH]] + ; CHECK-NEXT: ldd [[SCRATCH]], Z+1 + ; CHECK-NEXT: mov r31, [[SCRATCH]] + ; CHECK-NEXT: pop r30 + + early-clobber %r31r30 = LDWRdPtr undef %r31r30 +... 
+ diff --git a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir deleted file mode 100644 index 8427a2bfb4edf..0000000000000 --- a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir +++ /dev/null @@ -1,35 +0,0 @@ -# RUN: llc -O0 %s -o - -march=avr | FileCheck %s - -# This test ensures that the pseudo expander can correctly handle the case -# where we are expanding a 16-bit LDD instruction where the source and -# destination registers are the same. -# -# The instruction itself is earlyclobber and so ISel will never produce an -# instruction like this, but the stack slot loading can and will. - ---- | - target triple = "avr--" - - define void @test_lddw() { - entry: - ret void - } - -... ---- -name: test_lddw -tracksRegLiveness: true -stack: - - { id: 0, type: spill-slot, offset: -4, size: 1, alignment: 1, callee-saved-register: '%r28' } -body: | - bb.0.entry: - liveins: %r28, %r29 - - ; CHECK-LABEL: test_lddw - - ; CHECK: ldd [[TMPREG:r[0-9]+]], Y+0 - ; CHECK-NEXT: mov r28, [[TMPREG]] - ; CHECK-NEXT: ldd [[TMPREG]], Y+1 - ; CHECK-NEXT: mov r29, [[TMPREG]] - dead early-clobber %r29r28 = LDDWRdYQ killed %r29r28, 0 -... diff --git a/test/CodeGen/MSP430/select-use-sr.ll b/test/CodeGen/MSP430/select-use-sr.ll new file mode 100644 index 0000000000000..3f67fb85f793f --- /dev/null +++ b/test/CodeGen/MSP430/select-use-sr.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march=msp430 | FileCheck %s +; PR32769 + +target triple = "msp430" + +; Test that CMP instruction is not removed by MachineCSE. 
+; +; CHECK-LABEL: @f +; CHECK: cmp.w r15, r13 +; CHECK: cmp.w r15, r13 +; CHECK-NEXT: jeq .LBB0_2 +define i16 @f(i16, i16, i16, i16) { +entry: + %4 = icmp ult i16 %1, %3 + %5 = zext i1 %4 to i16 + %6 = icmp ult i16 %0, %2 + %7 = zext i1 %6 to i16 + %8 = icmp eq i16 %1, %3 + %out = select i1 %8, i16 %5, i16 %7 + ret i16 %out +} diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll index 1562372ce9a09..20853073dfa6f 100644 --- a/test/CodeGen/Mips/llvm-ir/mul.ll +++ b/test/CodeGen/Mips/llvm-ir/mul.ll @@ -268,7 +268,7 @@ entry: ; MM64R6: daddu $2, $[[T1]], $[[T0]] ; MM64R6-DAG: dmul $3, $5, $7 - ; MM32: lw $25, %call16(__multi3)($16) + ; MM32: lw $25, %call16(__multi3)($gp) %r = mul i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll index defd25bb41acf..ee2b212a9f2f4 100644 --- a/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -172,7 +172,7 @@ entry: ; 64R6: ddiv $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__divdi3)($2) + ; MM32: lw $25, %call16(__divdi3)($gp) ; MM64: ddiv $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -184,15 +184,7 @@ entry: define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: sdiv_i128: - - ; GP32: lw $25, %call16(__divti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__divti3)($gp) - ; 64R6: ld $25, %call16(__divti3)($gp) - - ; MM32: lw $25, %call16(__divti3)($16) - - ; MM64: ld $25, %call16(__divti3)($2) + ; ALL: l{{w|d}} $25, %call16(__divti3)($gp) %r = sdiv i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll index 42664d7457e5a..812c105669799 100644 --- a/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/test/CodeGen/Mips/llvm-ir/srem.ll @@ -164,7 +164,7 @@ entry: ; 64R6: dmod $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__moddi3)($2) + ; MM32: lw $25, %call16(__moddi3)($gp) ; MM64: dmod $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -177,14 
+177,7 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: srem_i128: - ; GP32: lw $25, %call16(__modti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__modti3)($gp) - ; 64R6: ld $25, %call16(__modti3)($gp) - - ; MM32: lw $25, %call16(__modti3)($16) - - ; MM64: ld $25, %call16(__modti3)($2) + ; ALL: l{{w|d}} $25, %call16(__modti3)($gp) %r = srem i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll index 78ab36442a9ae..6e078fdedfca3 100644 --- a/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -134,7 +134,7 @@ entry: ; 64R6: ddivu $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__udivdi3)($2) + ; MM32: lw $25, %call16(__udivdi3)($gp) ; MM64: ddivu $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -147,14 +147,7 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: udiv_i128: - ; GP32: lw $25, %call16(__udivti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__udivti3)($gp) - ; 64-R6: ld $25, %call16(__udivti3)($gp) - - ; MM32: lw $25, %call16(__udivti3)($16) - - ; MM64: ld $25, %call16(__udivti3)($2) + ; ALL: l{{w|d}} $25, %call16(__udivti3)($gp) %r = udiv i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll index 160c126c7e3ab..3bc82ceecd2a6 100644 --- a/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/test/CodeGen/Mips/llvm-ir/urem.ll @@ -190,7 +190,7 @@ entry: ; 64R6: dmodu $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__umoddi3)($2) + ; MM32: lw $25, %call16(__umoddi3)($gp) ; MM64: dmodu $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -208,9 +208,9 @@ entry: ; GP64-NOT-R6: ld $25, %call16(__umodti3)($gp) ; 64R6: ld $25, %call16(__umodti3)($gp) - ; MM32: lw $25, %call16(__umodti3)($16) + ; MM32: lw $25, %call16(__umodti3)($gp) - ; MM64: ld $25, %call16(__umodti3)($2) + ; MM64: ld $25, %call16(__umodti3)($gp) %r = urem i128 %a, %b ret i128 %r diff --git 
a/test/CodeGen/Mips/micromips-gp-rc.ll b/test/CodeGen/Mips/micromips-gp-rc.ll index f139f7a8486da..16e55c357db68 100644 --- a/test/CodeGen/Mips/micromips-gp-rc.ll +++ b/test/CodeGen/Mips/micromips-gp-rc.ll @@ -14,5 +14,5 @@ entry: ; Function Attrs: noreturn declare void @exit(i32 signext) -; CHECK: move $gp, ${{[0-9]+}} +; CHECK: addu $gp, ${{[0-9]+}} diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll index 564ffdd2f691b..6fa506849ee6b 100644 --- a/test/CodeGen/Mips/mips64fpldst.ll +++ b/test/CodeGen/Mips/mips64fpldst.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 -; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 -; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips 
-target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 @f0 = common global float 0.000000e+00, align 4 @d0 = common global double 0.000000e+00, align 8 diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll b/test/CodeGen/Mips/tailcall/tailcall.ll index 3f04e1cf30531..01a9b64ba63c6 100644 --- a/test/CodeGen/Mips/tailcall/tailcall.ll +++ b/test/CodeGen/Mips/tailcall/tailcall.ll @@ -176,7 +176,7 @@ entry: ; ALL-LABEL: caller8_1: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{.*}} $25 ; STATIC32: jal ; PIC64: jalr $25 ; STATIC64: jal @@ -288,7 +288,7 @@ entry: ; ALL-LABEL: caller13: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{.*}} $25 ; STATIC32: jal ; STATIC64: jal ; PIC64R6: jalr $25 diff --git a/test/CodeGen/PowerPC/empty-functions.ll b/test/CodeGen/PowerPC/empty-functions.ll index 56db8f39bffdd..b8394e14318fb 100644 --- a/test/CodeGen/PowerPC/empty-functions.ll +++ b/test/CodeGen/PowerPC/empty-functions.ll @@ -24,9 +24,7 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]] ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. 
; LINUX-FP: func: ; LINUX-FP-NEXT: {{^}}.L[[BEGIN:.*]]:{{$}} ; LINUX-FP-NEXT: .cfi_startproc @@ -38,8 +36,6 @@ entry: ; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .cfi_offset r31, -4 ; LINUX-FP-NEXT: mr 31, 1 -; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register r31 ; LINUX-FP-NEXT: {{^}}.L[[END:.*]]:{{$}} ; LINUX-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]] ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/SPARC/empty-functions.ll b/test/CodeGen/SPARC/empty-functions.ll index 1f8c5e3a312d0..974df232033a5 100644 --- a/test/CodeGen/SPARC/empty-functions.ll +++ b/test/CodeGen/SPARC/empty-functions.ll @@ -14,19 +14,11 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. ; LINUX-FP: func: ; LINUX-FP-NEXT: .cfi_startproc ; LINUX-FP-NEXT: {{^}}! ; LINUX-FP-NEXT: save %sp, -96, %sp ; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register %fp -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_window_save -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_register 15, 31 -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .size func, .Lfunc_end0-func ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll new file mode 100644 index 0000000000000..fc3b7ef1dadeb --- /dev/null +++ b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll @@ -0,0 +1,229 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs -disable-lsr | FileCheck %s +; +; Regression test for a machine verifier complaint discovered with llvm-stress. +; Test that splitting of a 128 bit store does not result in use of undef phys reg. 
+; This test case involved spilling of 128 bits, where the data operand was killed. + +define void @autogen_SD15107(i8*, i32*, i64*, i32, i64, i8) { +; CHECK: .text +BB: + %A4 = alloca double + %A1 = alloca i32 + %L = load i8, i8* %0 + br label %CF331 + +CF331: ; preds = %CF331, %BB + %Shuff = shufflevector <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, <8 x i32> <i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11> + %L5 = load i8, i8* %0 + %FC9 = fptosi float 0xC59D259100000000 to i8 + %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 10, i32 undef, i32 14, i32 0, i32 undef, i32 4, i32 6, i32 8> + %Tr = trunc <8 x i16> zeroinitializer to <8 x i1> + %Sl16 = select i1 true, i64 448097, i64 253977 + %E18 = extractelement <2 x i1> zeroinitializer, i32 1 + br i1 %E18, label %CF331, label %CF350 + +CF350: ; preds = %CF331 + %Cmp22 = icmp slt i8 %L, -1 + br label %CF + +CF: ; preds = %CF333, %CF364, %CF, %CF350 + %Shuff25 = shufflevector <16 x i1> zeroinitializer, <16 x i1> zeroinitializer, <16 x i32> <i32 25, i32 27, i32 29, i32 31, i32 1, i32 undef, i32 undef, i32 7, i32 9, i32 11, i32 undef, i32 15, i32 17, i32 19, i32 21, i32 23> + %B27 = mul <8 x i8> zeroinitializer, %Shuff + %L31 = load i8, i8* %0 + store i8 %L5, i8* %0 + %E32 = extractelement <8 x i64> %Shuff13, i32 5 + %Sl37 = select i1 %E18, i64* %2, i64* %2 + %E40 = extractelement <8 x i64> %Shuff13, i32 4 + %I42 = insertelement <8 x i64> %Shuff13, i64 0, i32 1 + %Sl44 = select i1 true, double* %A4, double* %A4 + %L46 = load i64, i64* %Sl37 + br i1 undef, label %CF, label %CF335 + +CF335: ; preds = %CF335, %CF + %Shuff48 = shufflevector <8 x i16> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> <i32 undef, i32 15, i32 undef, i32 3, i32 5, i32 7, i32 9, i32 11> + %B50 = sub <8 x i64> undef, zeroinitializer + %Se = sext i1 %Cmp22 to i64 + %Cmp52 = icmp ule i64 %E40, 184653 + br i1 %Cmp52, label %CF335, label %CF364 + +CF364: ; preds = %CF335 + 
store i64 %E32, i64* %Sl37 + %B57 = udiv <8 x i64> %I42, %B50 + %L61 = load i64, i64* %Sl37 + %Sl65 = select i1 undef, i1 %Cmp52, i1 true + br i1 %Sl65, label %CF, label %CF333 + +CF333: ; preds = %CF364 + %Cmp66 = fcmp uge float 0x474A237E00000000, undef + br i1 %Cmp66, label %CF, label %CF324 + +CF324: ; preds = %CF358, %CF360, %CF333 + %L67 = load i64, i64* %Sl37 + %Sl73 = select i1 %E18, i8 %L, i8 %L31 + %ZE = zext i1 true to i32 + %Cmp81 = icmp ult i64 184653, %L46 + br label %CF346 + +CF346: ; preds = %CF363, %CF346, %CF324 + %L82 = load double, double* %Sl44 + store i64 %Se, i64* %Sl37 + br i1 undef, label %CF346, label %CF363 + +CF363: ; preds = %CF346 + %I85 = insertelement <8 x i64> undef, i64 0, i32 4 + %Se86 = sext i1 %Cmp81 to i64 + %Cmp88 = icmp eq <16 x i1> zeroinitializer, undef + %Shuff91 = shufflevector <8 x i64> %B57, <8 x i64> %I42, <8 x i32> <i32 1, i32 undef, i32 5, i32 7, i32 undef, i32 11, i32 13, i32 undef> + %Sl95 = select i1 undef, i8 -1, i8 %5 + store i8 %FC9, i8* %0 + %Sl102 = select i1 %Sl65, float 0x3AAFABC380000000, float undef + %L104 = load i64, i64* %Sl37 + store i8 %Sl95, i8* %0 + br i1 undef, label %CF346, label %CF360 + +CF360: ; preds = %CF363 + %I107 = insertelement <16 x i1> undef, i1 %Sl65, i32 3 + %B108 = fdiv float undef, %Sl102 + %FC109 = sitofp <16 x i1> %Shuff25 to <16 x float> + %Cmp111 = icmp slt i8 %Sl73, %Sl95 + br i1 %Cmp111, label %CF324, label %CF344 + +CF344: ; preds = %CF344, %CF360 + store i64 %4, i64* %Sl37 + br i1 undef, label %CF344, label %CF358 + +CF358: ; preds = %CF344 + %B116 = add i8 29, %5 + %Sl118 = select i1 %Cmp81, <8 x i1> undef, <8 x i1> %Tr + %L120 = load i16, i16* undef + store i8 %FC9, i8* %0 + %E121 = extractelement <16 x i1> %Shuff25, i32 3 + br i1 %E121, label %CF324, label %CF325 + +CF325: ; preds = %CF362, %CF358 + %I123 = insertelement <8 x i16> undef, i16 %L120, i32 0 + %Sl125 = select i1 undef, i32 undef, i32 199785 + %Cmp126 = icmp ule <16 x i1> undef, %Cmp88 + br label %CF356 + 
+CF356: ; preds = %CF356, %CF325 + %FC131 = sitofp <8 x i8> %B27 to <8 x double> + store i8 %Sl73, i8* %0 + store i64 396197, i64* %Sl37 + %L150 = load i64, i64* %Sl37 + %Cmp157 = icmp ult i64 %L150, %L61 + br i1 %Cmp157, label %CF356, label %CF359 + +CF359: ; preds = %CF359, %CF356 + %B162 = srem <8 x i64> %I85, %Shuff13 + %Tr163 = trunc i64 %Se to i8 + %Sl164 = select i1 %Cmp52, i32* %A1, i32* %1 + store i64 %E32, i64* undef + %I168 = insertelement <8 x i16> %I123, i16 undef, i32 5 + %Se170 = sext i1 %Cmp81 to i32 + %Cmp172 = icmp uge i8 %Sl73, %Sl73 + br i1 %Cmp172, label %CF359, label %CF362 + +CF362: ; preds = %CF359 + store i16 0, i16* undef + store i64 448097, i64* %Sl37 + %E189 = extractelement <8 x i16> %Shuff48, i32 6 + %Sl194 = select i1 %Cmp111, i8 29, i8 0 + %Cmp195 = icmp eq i32 %ZE, %ZE + br i1 %Cmp195, label %CF325, label %CF326 + +CF326: ; preds = %CF342, %CF362 + store i64 %L104, i64* undef + br label %CF342 + +CF342: ; preds = %CF326 + %Cmp203 = icmp ule i1 %Cmp195, %E18 + br i1 %Cmp203, label %CF326, label %CF337 + +CF337: ; preds = %CF342 + br label %CF327 + +CF327: ; preds = %CF336, %CF355, %CF327, %CF337 + store i64 %Se86, i64* undef + %Tr216 = trunc i64 184653 to i16 + %Sl217 = select i1 %Cmp157, <4 x i1> undef, <4 x i1> undef + %Cmp218 = icmp slt i32 undef, %Se170 + br i1 %Cmp218, label %CF327, label %CF355 + +CF355: ; preds = %CF327 + %E220 = extractelement <16 x i1> %Cmp126, i32 3 + br i1 %E220, label %CF327, label %CF340 + +CF340: ; preds = %CF355 + %Sl224 = select i1 %Sl65, double undef, double 0xBE278346AB25A5C4 + br label %CF334 + +CF334: ; preds = %CF343, %CF334, %CF340 + %L226 = load i64, i64* undef + store i32 %3, i32* %Sl164 + %Cmp233 = icmp uge i16 %Tr216, %L120 + br i1 %Cmp233, label %CF334, label %CF354 + +CF354: ; preds = %CF334 + store i64 %L226, i64* %Sl37 + %Cmp240 = icmp uge i1 %Cmp52, undef + %Shuff243 = shufflevector <16 x i1> %I107, <16 x i1> undef, <16 x i32> <i32 28, i32 30, i32 undef, i32 2, i32 4, i32 6, i32 8, i32 
10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 undef> + %B245 = fmul <16 x float> %FC109, %FC109 + br label %CF343 + +CF343: ; preds = %CF354 + %Cmp248 = icmp sgt i8 0, %B116 + br i1 %Cmp248, label %CF334, label %CF336 + +CF336: ; preds = %CF343 + store i64 %E32, i64* undef + br i1 undef, label %CF327, label %CF328 + +CF328: ; preds = %CF345, %CF336 + br label %CF345 + +CF345: ; preds = %CF328 + %E257 = extractelement <4 x i1> %Sl217, i32 2 + br i1 %E257, label %CF328, label %CF338 + +CF338: ; preds = %CF345 + %Sl261 = select i1 %E121, <8 x i16> zeroinitializer, <8 x i16> undef + %Cmp262 = icmp sgt i8 undef, %Sl194 + br label %CF329 + +CF329: ; preds = %CF339, %CF348, %CF357, %CF338 + store i64 %L67, i64* %Sl37 + br label %CF357 + +CF357: ; preds = %CF329 + %Cmp275 = icmp ne i1 %Cmp203, %Sl65 + br i1 %Cmp275, label %CF329, label %CF348 + +CF348: ; preds = %CF357 + %Shuff286 = shufflevector <8 x i16> undef, <8 x i16> %Sl261, <8 x i32> <i32 6, i32 8, i32 10, i32 12, i32 undef, i32 0, i32 2, i32 4> + %Cmp291 = icmp ne i32 %Sl125, undef + br i1 %Cmp291, label %CF329, label %CF339 + +CF339: ; preds = %CF348 + %Cmp299 = fcmp ugt double %L82, undef + br i1 %Cmp299, label %CF329, label %CF330 + +CF330: ; preds = %CF361, %CF330, %CF339 + %E301 = extractelement <8 x double> %FC131, i32 3 + store i64 %Sl16, i64* %Sl37 + %Se313 = sext <8 x i1> %Sl118 to <8 x i32> + %Cmp315 = icmp sgt i8 %Tr163, %L + br i1 %Cmp315, label %CF330, label %CF361 + +CF361: ; preds = %CF330 + store i16 %L120, i16* undef + %Shuff318 = shufflevector <8 x i64> %B162, <8 x i64> undef, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6> + %ZE321 = zext i16 %E189 to i64 + %Sl322 = select i1 %Cmp240, i1 %Cmp262, i1 %Cmp291 + br i1 %Sl322, label %CF330, label %CF351 + +CF351: ; preds = %CF361 + store double %Sl224, double* %Sl44 + store i32 %ZE, i32* %Sl164 + ret void +} diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll index c549bd425aafe..13951ef4354b4 
100644 --- a/test/CodeGen/Thumb/long.ll +++ b/test/CodeGen/Thumb/long.ll @@ -206,3 +206,34 @@ entry: ; CHECK: adds r0, r0, r2 ; CHECK: sbcs r1, r3 } + +declare void @f13(i64 %x) + +define void @f14(i1 %x, i64 %y) #0 { +; CHECK-LABEL: f14: +entry: + %a = add i64 %y, 47 + call void @f13(i64 %a) +; CHECK: bl + br i1 %x, label %if.end, label %if.then + +if.then: + call void @f13(i64 %y) +; CHECK: bl + br label %if.end + +if.end: + %b = add i64 %y, 45 + call void @f13(i64 %b) +; CHECK: adds +; CHECK: adcs +; CHECK: bl + %c = add i64 %y, 47 + call void @f13(i64 %c) +; CHECK: adds +; CHECK-NEXT: adcs +; CHECK: bl + ret void +} + +attributes #0 = { optsize } diff --git a/test/CodeGen/Thumb/optionaldef-scheduling.ll b/test/CodeGen/Thumb/optionaldef-scheduling.ll new file mode 100644 index 0000000000000..bd091cf2b6f84 --- /dev/null +++ b/test/CodeGen/Thumb/optionaldef-scheduling.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s +; RUN: llc -mtriple=thumbv6-eabi %s -verify-machineinstrs -o - | FileCheck %s + +define i1 @test(i64 %arg) { +entry: + %ispos = icmp sgt i64 %arg, -1 + %neg = sub i64 0, %arg + %sel = select i1 %ispos, i64 %arg, i64 %neg + %cmp2 = icmp eq i64 %sel, %arg + ret i1 %cmp2 +} + +; The scheduler used to ignore OptionalDefs, and could unwittingly insert +; a flag-setting instruction in between an ADDS and the corresponding ADC. 
+ +; CHECK: adds +; CHECK-NOT: eors +; CHECK: adcs diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll new file mode 100644 index 0000000000000..ec62ece6d408b --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/callingconv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL +; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL +; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL + +define i32 @test_ret_i32() { +; X32-LABEL: test_ret_i32: +; X32: # BB#0: +; X32-NEXT: movl $20, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_ret_i32: +; X64: # BB#0: +; X64-NEXT: movl $20, %eax +; X64-NEXT: retq + ret i32 20 +} + +define i64 @test_ret_i64() { +; X32_GISEL-LABEL: test_ret_i64: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X32_GISEL-NEXT: movl $15, %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_ret_i64: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl $-1, %eax +; X32_ISEL-NEXT: movl $15, %edx +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_ret_i64: +; X64: # BB#0: +; X64-NEXT: movabsq $68719476735, %rax # imm = 0xFFFFFFFFF +; X64-NEXT: retq + ret i64 68719476735 +} + +define i32 @test_arg_i32(i32 %a) { +; X32_GISEL-LABEL: test_arg_i32: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 4(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_arg_i32: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 4(%esp), %eax +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_arg_i32: +; 
X64: # BB#0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + ret i32 %a +} + +define i64 @test_arg_i64(i64 %a) { +; X32_GISEL-LABEL: test_arg_i64: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 4(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: leal 8(%esp), %ecx +; X32_GISEL-NEXT: movl (%ecx), %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_arg_i64: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 4(%esp), %eax +; X32_ISEL-NEXT: movl 8(%esp), %edx +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_arg_i64: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + ret i64 %a +} + +define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) { +; X32_GISEL-LABEL: test_i64_args_8: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 60(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: leal 64(%esp), %ecx +; X32_GISEL-NEXT: movl (%ecx), %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_i64_args_8: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 60(%esp), %eax +; X32_ISEL-NEXT: movl 64(%esp), %edx +; X32_ISEL-NEXT: retl +; +; X64_GISEL-LABEL: test_i64_args_8: +; X64_GISEL: # BB#0: +; X64_GISEL-NEXT: leaq 16(%rsp), %rax +; X64_GISEL-NEXT: movq (%rax), %rax +; X64_GISEL-NEXT: retq +; +; X64_ISEL-LABEL: test_i64_args_8: +; X64_ISEL: # BB#0: +; X64_ISEL-NEXT: movq 16(%rsp), %rax +; X64_ISEL-NEXT: retq + + ret i64 %arg8 +} + +define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) { +; X32-LABEL: test_v4i32_args: +; X32: # BB#0: +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_v4i32_args: +; X64: # BB#0: +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + ret <4 x i32> %arg2 +} + +define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) { +; X32-LABEL: test_v8i32_args: +; X32: # BB#0: +; X32-NEXT: retl +; +; X64-LABEL: test_v8i32_args: +; X64: # BB#0: +; X64-NEXT: retq + + ret <8 x i32> %arg1 +} diff --git 
a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll index 616cb70652bb1..8ea3e4f9d739a 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll @@ -207,24 +207,15 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, ; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]] ; X32-NEXT: [[ARG8H:%[0-9]+]](s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0) -; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG1_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG1L]](s32), 0 -; X32-NEXT: [[ARG1_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](s32), 32 -; X32-NEXT: [[ARG1:%[0-9]+]](s64) = COPY [[ARG1_TMP1]] - ; ... a bunch more that we don't track ... - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF -; X32: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG7_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG7L]](s32), 0 -; X32-NEXT: [[ARG7_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG7_TMP0]], [[ARG7H]](s32), 32 -; X32-NEXT: [[ARG7:%[0-9]+]](s64) = COPY [[ARG7_TMP1]] -; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG8_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG8L]](s32), 0 -; X32-NEXT: [[ARG8_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG8_TMP0]], [[ARG8H]](s32), 32 -; X32-NEXT: [[ARG8:%[0-9]+]](s64) = COPY [[ARG8_TMP1]] +; X32-NEXT: [[ARG1:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG1L]](s32), [[ARG1H]](s32) +; ... a bunch more that we don't track ... 
+; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: [[ARG7:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG7L]](s32), [[ARG7H]](s32) +; X32-NEXT: [[ARG8:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG8L]](s32), [[ARG8H]](s32) ; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit ; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit @@ -236,8 +227,7 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, ; X64-NEXT: %rax = COPY [[ARG1]](s64) ; X64-NEXT: RET 0, implicit %rax -; X32-NEXT: [[RETL:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 0 -; X32-NEXT: [[RETH:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 32 +; X32-NEXT: [[RETL:%[0-9]+]](s32), [[RETH:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](s64) ; X32-NEXT: %eax = COPY [[RETL:%[0-9]+]](s32) ; X32-NEXT: %edx = COPY [[RETH:%[0-9]+]](s32) ; X32-NEXT: RET 0, implicit %eax, implicit %edx diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll index e2d938550aea0..90a05f5fc225c 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll @@ -15,12 +15,8 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) { ; X64: liveins: %xmm0, %xmm1 ; X64: [[ARG1L:%[0-9]+]](<4 x s32>) = COPY %xmm0 ; X64-NEXT: [[ARG1H:%[0-9]+]](<4 x s32>) = COPY %xmm1 -; X64-NEXT: [[UNDEF:%[0-9]+]](<8 x s32>) = IMPLICIT_DEF -; X64-NEXT: [[ARG1_TMP0:%[0-9]+]](<8 x s32>) = G_INSERT [[UNDEF]], [[ARG1L]](<4 x s32>), 0 -; X64-NEXT: [[ARG1_TMP1:%[0-9]+]](<8 x s32>) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](<4 x s32>), 128 -; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = COPY [[ARG1_TMP1]] -; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 0 -; X64-NEXT: [[RETH:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 128 +; X64-NEXT: 
[[ARG1:%[0-9]+]](<8 x s32>) = G_MERGE_VALUES [[ARG1L]](<4 x s32>), [[ARG1H]](<4 x s32>) +; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>), [[RETH:%[0-9]+]](<4 x s32>) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](<8 x s32>) ; X64-NEXT: %xmm0 = COPY [[RETL:%[0-9]+]](<4 x s32>) ; X64-NEXT: %xmm1 = COPY [[RETH:%[0-9]+]](<4 x s32>) ; X64-NEXT: RET 0, implicit %xmm0, implicit %xmm1 diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll index 6fe66436e4a8a..f793e36026b1a 100644 --- a/test/CodeGen/X86/GlobalISel/memop.ll +++ b/test/CodeGen/X86/GlobalISel/memop.ll @@ -65,7 +65,7 @@ define double @test_load_double(double * %p1) { ; SSE-LABEL: test_load_double: ; SSE: # BB#0: ; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; ; ALL_AVX-LABEL: test_load_double: @@ -160,7 +160,7 @@ define double * @test_store_double(double %val, double * %p1) { ; ; SSE_FAST-LABEL: test_store_double: ; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %rax +; SSE_FAST-NEXT: movq %xmm0, %rax ; SSE_FAST-NEXT: movq %rax, (%rdi) ; SSE_FAST-NEXT: movq %rdi, %rax ; SSE_FAST-NEXT: retq diff --git a/test/CodeGen/X86/asm-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll index 47accdbc07b33..ced074015acef 100644 --- a/test/CodeGen/X86/asm-reg-type-mismatch.ll +++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll @@ -27,5 +27,5 @@ entry: ret i64 %0 ; CHECK: test2 ; CHECK: movq {{.*}}, %xmm7 - ; CHECK: movd %xmm7, %rax + ; CHECK: movq %xmm7, %rax } diff --git a/test/CodeGen/X86/atomic-non-integer.ll b/test/CodeGen/X86/atomic-non-integer.ll index 17b73ecf4e1c7..1f25c71a9f762 100644 --- a/test/CodeGen/X86/atomic-non-integer.ll +++ b/test/CodeGen/X86/atomic-non-integer.ll @@ -26,7 +26,7 @@ define void @store_float(float* %fptr, float %v) { define void @store_double(double* %fptr, double %v) { ; CHECK-LABEL: @store_double -; CHECK: movd %xmm0, %rax +; CHECK: movq %xmm0, %rax ; CHECK: movq %rax, (%rdi) store atomic double %v, 
double* %fptr unordered, align 8 ret void @@ -59,7 +59,7 @@ define float @load_float(float* %fptr) { define double @load_double(double* %fptr) { ; CHECK-LABEL: @load_double ; CHECK: movq (%rdi), %rax -; CHECK: movd %rax, %xmm0 +; CHECK: movq %rax, %xmm0 %v = load atomic double, double* %fptr unordered, align 8 ret double %v } @@ -85,7 +85,7 @@ define void @store_float_seq_cst(float* %fptr, float %v) { define void @store_double_seq_cst(double* %fptr, double %v) { ; CHECK-LABEL: @store_double_seq_cst -; CHECK: movd %xmm0, %rax +; CHECK: movq %xmm0, %rax ; CHECK: xchgq %rax, (%rdi) store atomic double %v, double* %fptr seq_cst, align 8 ret void @@ -102,7 +102,7 @@ define float @load_float_seq_cst(float* %fptr) { define double @load_double_seq_cst(double* %fptr) { ; CHECK-LABEL: @load_double_seq_cst ; CHECK: movq (%rdi), %rax -; CHECK: movd %rax, %xmm0 +; CHECK: movq %rax, %xmm0 %v = load atomic double, double* %fptr seq_cst, align 8 ret double %v } diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll new file mode 100644 index 0000000000000..052cacfea4dc0 --- /dev/null +++ b/test/CodeGen/X86/avx-schedule.ll @@ -0,0 +1,2840 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_addpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fadd <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_addps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: 
[3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fadd <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_addsubpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_addsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: 
[3:1.00] +; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_andnotpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: 
[6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> + %4 = and <4 x i64> %3, %2 + %5 = load <4 x double>, <4 x double> *%a2, align 32 + %6 = bitcast <4 x double> %5 to <4 x i64> + %7 = xor <4 x i64> %4, <i64 -1, i64 -1, i64 -1, i64 -1> + %8 = and <4 x i64> %6, %7 + %9 = bitcast <4 x i64> %8 to <4 x double> + %10 = fadd <4 x double> %a1, %9 + ret <4 x double> %10 +} + +define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_andnotps: +; SANDY: # BB#0: +; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> + %4 = and <4 x i64> %3, %2 + %5 = load <8 x float>, <8 x float> *%a2, align 
32 + %6 = bitcast <8 x float> %5 to <4 x i64> + %7 = xor <4 x i64> %4, <i64 -1, i64 -1, i64 -1, i64 -1> + %8 = and <4 x i64> %6, %7 + %9 = bitcast <4 x i64> %8 to <8 x float> + %10 = fadd <8 x float> %a1, %9 + ret <8 x float> %10 +} + +define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_andpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = and <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = and <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_andps: +; SANDY: # BB#0: +; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandps (%rdi), %ymm0, 
%ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = and <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = and <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_blendpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] +; HASWELL-NEXT: retq # 
sched: [1:1.00] +; +; BTVER2-LABEL: test_blendpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fadd <4 x double> %a1, %1 + %4 = shufflevector <4 x double> %3, <4 x double> %2, <4 x i32> <i32 0, i32 5, i32 6, i32 3> + ret <4 x double> %4 +} + +define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_blendps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33] +; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 14, i32 7> + ret <8 x float> %3 +} + +define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) { +; SANDY-LABEL: test_blendvpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + %2 = load <4 x double>, <4 x double> *%a3, align 32 + %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + +define <8 x float> 
@test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) { +; SANDY-LABEL: test_blendvps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = load <8 x float>, <8 x float> *%a3, align 32 + %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone + +define <8 x float> @test_broadcastf128(<4 x float> *%a0) { +; SANDY-LABEL: test_broadcastf128: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; 
ZNVER1-LABEL: test_broadcastf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x float>, <4 x float> *%a0, align 32 + %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %2 +} + +define <4 x double> @test_broadcastsd_ymm(double *%a0) { +; SANDY-LABEL: test_broadcastsd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastsd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastsd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastsd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load double, double *%a0, align 8 + %2 = insertelement <4 x double> undef, double %1, i32 0 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %3 +} + +define <4 x float> @test_broadcastss(float *%a0) { +; SANDY-LABEL: test_broadcastss: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load float, float *%a0, align 4 + %2 
= insertelement <4 x float> undef, float %1, i32 0 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %3 +} + +define <8 x float> @test_broadcastss_ymm(float *%a0) { +; SANDY-LABEL: test_broadcastss_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastss_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastss_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastss_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load float, float *%a0, align 4 + %2 = insertelement <8 x float> undef, float %1, i32 0 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %3 +} + +define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_cmppd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmppd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmppd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fcmp oeq <4 x double> %a0, %2 + %4 = sext <4 x i1> %1 to <4 x i64> + %5 = sext <4 x i1> %3 to <4 x i64> + %6 = or <4 x i64> %4, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + ret <4 x double> %7 +} + +define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_cmpps: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fcmp oeq <8 x float> %a0, %2 + %4 = sext <8 x i1> %1 to <8 x i32> + %5 = sext <8 x i1> %3 to <8 x i32> + %6 = or <8 x i32> %4, %5 + %7 = bitcast <8 x i32> 
%6 to <8 x float> + ret <8 x float> %7 +} + +define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { +; SANDY-LABEL: test_cvtdq2pd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = sitofp <4 x i32> %a0 to <4 x double> + %2 = load <4 x i32>, <4 x i32> *%a1, align 16 + %3 = sitofp <4 x i32> %2 to <4 x double> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { +; SANDY-LABEL: test_cvtdq2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] +; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: 
[8:1.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = sitofp <8 x i32> %a0 to <8 x float> + %2 = load <8 x i32>, <8 x i32> *%a1, align 16 + %3 = sitofp <8 x i32> %2 to <8 x float> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_cvtpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00] +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; 
ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptosi <4 x double> %a0 to <4 x i32> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = fptosi <4 x double> %2 to <4 x i32> + %4 = shufflevector <4 x i32> %1, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i32> %4 +} + +define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_cvtpd2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptrunc <4 x double> %a0 to <4 x float> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = fptrunc <4 x double> %2 to <4 x float> + %4 = shufflevector <4 x float> %1, <4 x float> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %4 +} + +define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_cvtps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttps2dq 
(%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptosi <8 x float> %a0 to <8 x i32> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = fptosi <8 x float> %2 to <8 x i32> + %4 = or <8 x i32> %1, %3 + ret <8 x i32> %4 +} + +define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_divpd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00] +; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; ZNVER1-NEXT: 
vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fdiv <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fdiv <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_divps: +; SANDY: # BB#0: +; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fdiv <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fdiv <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_dpps: +; SANDY: # BB#0: +; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_dpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00] +; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_dpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vdpps $7, (%rdi), %ymm0, 
%ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_extractf128: +; SANDY: # BB#0: +; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_extractf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_extractf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extractf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + store <4 x float> %2, <4 x float> *%a2 
+ ret <4 x float> %1 +} + +define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_haddpd: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_haddps: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 
# sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_hsubpd: +; SANDY: # BB#0: +; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; 
SANDY-LABEL: test_hsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_insertf128: +; SANDY: # BB#0: +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00] +; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_insertf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_insertf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: 
[1:0.50] +; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + %3 = load <4 x float>, <4 x float> *%a2, align 16 + %4 = shufflevector <4 x float> %3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = shufflevector <8 x float> %a0, <8 x float> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + %6 = fadd <8 x float> %2, %5 + ret <8 x float> %6 +} + +define <32 x i8> @test_lddqu(i8* %a0) { +; SANDY-LABEL: test_lddqu: +; SANDY: # BB#0: +; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_lddqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lddqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) + ret <32 x i8> %1 +} +declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly + +define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { +; SANDY-LABEL: test_maskmovpd: +; SANDY: 
# BB#0: +; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) + call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) + ret <2 x double> %1 +} +declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly +declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind + +define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) { +; SANDY-LABEL: test_maskmovpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00] +; HASWELL-NEXT: 
vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) + call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) + ret <4 x double> %1 +} +declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readonly +declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind + +define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { +; SANDY-LABEL: test_maskmovps: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovaps %xmm2, 
%xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) + call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) + ret <4 x float> %1 +} +declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly +declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind + +define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) { +; SANDY-LABEL: test_maskmovps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> 
@llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) + call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) + ret <8 x float> %1 +} +declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readonly +declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind + +define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_maxpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_maxps: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxps %ymm1, %ymm0, 
%ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_minpd: +; SANDY: # BB#0: +; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> 
@llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_minps: +; SANDY: # BB#0: +; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movapd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movapd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: 
[1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movapd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x double>, <4 x double> *%a0, align 32 + %2 = fadd <4 x double> %1, %1 + store <4 x double> %2, <4 x double> *%a1, align 32 + ret <4 x double> %2 +} + +define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movaps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movaps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movaps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <8 x float>, <8 x float> *%a0, align 32 + %2 = fadd <8 x float> %1, %1 + store <8 x float> %2, <8 x float> *%a1, align 
32 + ret <8 x float> %2 +} + +define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movddup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] +; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movddup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] +; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movddup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define i32 @test_movmskpd(<4 x double> %a0) { +; SANDY-LABEL: test_movmskpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; 
HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone + +define i32 @test_movmskps(<8 x float> %a0) { +; SANDY-LABEL: test_movmskps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movntpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_movntpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <4 x double> %a0, %a0 + store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 + ret <4 x double> %1 +} + +define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movntps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <8 x float> %a0, %a0 + store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 + ret <8 x float> %1 +} + +define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movshdup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] +; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movshdup: +; HASWELL: # BB#0: 
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] +; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movshdup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movsldup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] +; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsldup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] +; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsldup: +; BTVER2: # BB#0: +; 
BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movupd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movupd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movupd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: 
vmovupd (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x double>, <4 x double> *%a0, align 1 + %2 = fadd <4 x double> %1, %1 + store <4 x double> %2, <4 x double> *%a1, align 1 + ret <4 x double> %2 +} + +define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movups: +; SANDY: # BB#0: +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] +; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movups: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movups: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <8 x float>, <8 x float> *%a0, align 1 + %2 = fadd <8 x float> %1, %1 + store <8 x float> %2, <8 x float> *%a1, align 1 + ret <8 x float> %2 +} + +define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_mulpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmulpd 
(%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fmul <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fmul <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_mulps: +; SANDY: # BB#0: +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fmul <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fmul <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <4 x double> @orpd(<4 x double> %a0, <4 x 
double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: orpd: +; SANDY: # BB#0: +; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: orpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: orpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = or <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = or <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_orps: +; SANDY: # BB#0: +; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_orps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, 
%ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_orps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = or <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = or <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { +; SANDY-LABEL: test_permilpd: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] +; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; BTVER2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] 
sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0> + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 1, i32 0> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_permilpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00] +; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> + %4 = 
fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { +; SANDY-LABEL: test_permilps: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] +; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; BTVER2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} + +define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_permilps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] +; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_permilps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; SANDY-LABEL: test_permilvarpd: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilpd (%rdi), 
%xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone + +define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x i64> *%a2) { +; SANDY-LABEL: test_permilvarpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) + %2 = load <4 x i64>, <4 x i64> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone + +define 
<4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; SANDY-LABEL: test_permilvarps: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone + +define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> *%a2) { +; SANDY-LABEL: test_permilvarps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: 
vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) + %2 = load <8 x i32>, <8 x i32> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone + +define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_rcpps: +; SANDY: # BB#0: +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rcpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rcpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> 
%4 +} +declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_roundpd: +; SANDY: # BB#0: +; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_roundps: +; SANDY: # BB#0: +; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundps: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone + +define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_rsqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rsqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rsqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00] +; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_shufpd: +; SANDY: # BB#0: +; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] +; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] +; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, 
<4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 4, i32 2, i32 7> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_shufps: +; SANDY: # BB#0: +; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00] +; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00] +; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] +; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 3, i32 8, i32 8, i32 4, i32 7, i32 12, i32 12> + ret <8 x float> %3 +} + +define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_sqrtpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00] +; SANDY-NEXT: 
vsqrtpd (%rdi), %ymm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00] +; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_sqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00] +; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00] +; BTVER2-NEXT: 
vsqrtps %ymm0, %ymm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_subpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fsub <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fsub <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_subps: +; SANDY: # BB#0: +; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: 
[3:1.00] +; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fsub <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fsub <8 x float> %1, %2 + ret <8 x float> %3 +} + +define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; SANDY-LABEL: test_testpd: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: 
[4:1.00] +; +; ZNVER1-LABEL: test_testpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone + +define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_testpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: 
[1:0.50] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_testps: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testps: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testps: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: 
[6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_testps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; 
ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_unpckhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] +; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x 
i32> <i32 1, i32 5, i32 3, i32 7> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_unpckhps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] +; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x float> %3 +} + +define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_unpcklpd: +; SANDY: # BB#0: +; 
SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] +; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_unpcklps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] 
+; +; HASWELL-LABEL: test_unpcklps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] +; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x float> %3 +} + +define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_xorpd: +; SANDY: # BB#0: +; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorpd: +; 
BTVER2: # BB#0: +; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = xor <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_xorps: +; SANDY: # BB#0: +; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: 
[3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = xor <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +!0 = !{i32 1} diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll index 12aa863a37a15..b75db95869c27 100644 --- a/test/CodeGen/X86/bitcast2.ll +++ b/test/CodeGen/X86/bitcast2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movd | count 2 +; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movq | count 2 ; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp define i64 @test1(double %A) { diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index d0967c1021492..1b69b5542556a 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s ; FIXME: add (sext i1 X), 1 -> zext (not i1 X) @@ -20,13 +20,93 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: pslld $31, %xmm0 -; CHECK-NEXT: psrad $31, %xmm0 -; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> ret <4 x i32> %add } +define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; CHECK-LABEL: cmpgt_sext_inc_vec: +; 
CHECK: # BB#0: +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +} + +define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; CHECK-LABEL: cmpne_sext_inc_vec: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp ne <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +} + +define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { +; CHECK-LABEL: cmpgt_sext_inc_vec256: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <4 x i64> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i64> + %add = add <4 x i64> %ext, <i64 1, i64 1, i64 1, i64 1> + ret <4 x i64> %add +} + +define i32 @bool_logic_and_math(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +; CHECK-LABEL: bool_logic_and_math: +; CHECK: # BB#0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setne %al +; CHECK-NEXT: cmpl %ecx, %edx +; CHECK-NEXT: setne %cl +; CHECK-NEXT: andb %al, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %cmp1 = icmp ne i32 %a, %b + %cmp2 = icmp ne i32 %c, %d + %and = and i1 %cmp1, %cmp2 + %ext = sext i1 %and to i32 + %add = add i32 %ext, 1 + ret i32 %add +} + +define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { +; CHECK-LABEL: 
bool_logic_and_math_vec: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp1 = icmp ne <4 x i32> %a, %b + %cmp2 = icmp ne <4 x i32> %c, %d + %and = and <4 x i1> %cmp1, %cmp2 + %ext = sext <4 x i1> %and to <4 x i32> + %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +} diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll index c425e3a92d173..ae0f4406ba0d2 100644 --- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -928,7 +928,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rcx ; SSE-NEXT: movq %rcx, %r8 ; SSE-NEXT: movq %rcx, %r9 ; SSE-NEXT: movq %rcx, %r10 @@ -938,7 +938,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE-NEXT: movq %rcx, %rdi ; SSE-NEXT: andb $15, %cl ; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movd %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rcx ; SSE-NEXT: shrq $56, %rdi ; SSE-NEXT: andb $15, %dil ; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp) @@ -1106,7 +1106,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rcx ; SSE-NEXT: movq %rcx, %r8 ; SSE-NEXT: movq %rcx, %r9 ; SSE-NEXT: movq %rcx, %r10 @@ -1116,7 +1116,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE-NEXT: movq %rcx, %rdi ; SSE-NEXT: andb $15, %cl ; SSE-NEXT: movb %cl, 
-{{[0-9]+}}(%rsp) -; SSE-NEXT: movd %xmm2, %rcx +; SSE-NEXT: movq %xmm2, %rcx ; SSE-NEXT: shrq $56, %rdi ; SSE-NEXT: andb $15, %dil ; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp) diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 44c4510c89e1a..706e89051a3da 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -223,18 +223,17 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) { define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; SSE-LABEL: combine_vec_lshr_trunc_lshr0: ; SSE: # BB#0: -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0 ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32> diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll index 71f6c3e633342..e1e849929405a 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -76,6 +76,53 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) { ret <4 x i32> %1 } +define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_udiv_by_pow2c: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm2, 
%xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_udiv_by_pow2c: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_udiv_by_pow2c: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y + %2 = udiv <4 x i32> %x, %1 + ret <4 x i32> %2 +} + ; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_udiv_by_shl_pow2a: diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll index f412e9ca6312b..91da268a8d75a 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -64,6 +64,99 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) { ret <4 x 
i32> %1 } +define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_urem_by_pow2c: +; SSE: # BB#0: +; SSE-NEXT: pslld $23, %xmm1 +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_urem_by_pow2c: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_pow2c: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y + %2 = urem <4 x i32> %x, %1 + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_urem_by_pow2d: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm2, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: pblendw 
{{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_urem_by_pow2d: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_pow2d: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y + %2 = urem <4 x i32> %x, %1 + ret <4 x i32> %2 +} + ; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_shl_pow2a: diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll new file mode 100644 index 
0000000000000..83589b7706f75 --- /dev/null +++ b/test/CodeGen/X86/constant-hoisting-bfi.ll @@ -0,0 +1,115 @@ +; RUN: opt -consthoist -mtriple=x86_64-unknown-linux-gnu -consthoist-with-block-frequency=true -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; will not be hoisted to the func entry. +; CHECK-LABEL: @foo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.then: + +; Function Attrs: norecurse nounwind uwtable +define i64 @foo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp slt i64 %t0, 564 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp slt i64 %t1, 1009 + br i1 %cmp2, label %if.then3, label %return + +if.then3: ; preds = %if.then + %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx4, align 8 + %inc = add nsw i64 %t2, 1 + store i64 %inc, i64* %arrayidx4, align 8 + br label %return + +if.else5: ; preds = %entry + %arrayidx6 = getelementptr inbounds i64, i64* %a, i64 6 + %t3 = load i64, i64* %arrayidx6, align 8 + %cmp7 = icmp slt i64 %t3, 3512 + br i1 %cmp7, label %if.then8, label %return + +if.then8: ; preds = %if.else5 + %arrayidx9 = getelementptr inbounds i64, i64* %a, i64 7 + %t4 = load i64, i64* %arrayidx9, align 8 + %inc10 = add nsw i64 %t4, 1 + store i64 %inc10, i64* %arrayidx9, align 8 + br label %return + +return: ; preds = %if.else5, %if.then, %if.then8, %if.then3 + %retval.0 = phi i64 [ 214748364701, %if.then3 ], [ 214748364701, %if.then8 ], [ 250148364702, %if.then ], [ 256148364704, %if.else5 ] + ret i64 %retval.0 +} + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; in while.body will be hoisted to 
while.body.preheader. 214748364701 in +; if.then16 and if.else10 will be merged and hoisted to the beginning of +; if.else10 because if.else10 dominates if.then16. +; CHECK-LABEL: @goo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: while.body.preheader: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.else10: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +define i64 @goo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp ult i64 %t0, 56 + br i1 %cmp, label %if.then, label %if.else10, !prof !0 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp ult i64 %t1, 10 + br i1 %cmp2, label %while.cond.preheader, label %return, !prof !0 + +while.cond.preheader: ; preds = %if.then + %arrayidx7 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx7, align 8 + %cmp823 = icmp ugt i64 %t2, 10000 + br i1 %cmp823, label %while.body.preheader, label %return + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %t3 = phi i64 [ %add, %while.body ], [ %t2, %while.body.preheader ] + %add = add i64 %t3, 214748364701 + %cmp8 = icmp ugt i64 %add, 10000 + br i1 %cmp8, label %while.body, label %while.cond.return.loopexit_crit_edge + +if.else10: ; preds = %entry + %arrayidx11 = getelementptr inbounds i64, i64* %a, i64 6 + %t4 = load i64, i64* %arrayidx11, align 8 + %add2 = add i64 %t4, 214748364701 + %cmp12 = icmp ult i64 %add2, 35 + br i1 %cmp12, label %if.then16, label %return, !prof !0 + +if.then16: ; preds = %if.else10 + %arrayidx17 = getelementptr inbounds i64, i64* %a, i64 7 + %t5 = load i64, i64* %arrayidx17, align 8 + %inc = add i64 %t5, 1 + store i64 %inc, i64* 
%arrayidx17, align 8 + br label %return + +while.cond.return.loopexit_crit_edge: ; preds = %while.body + store i64 %add, i64* %arrayidx7, align 8 + br label %return + +return: ; preds = %while.cond.preheader, %while.cond.return.loopexit_crit_edge, %if.else10, %if.then, %if.then16 + %retval.0 = phi i64 [ 214748364701, %if.then16 ], [ 0, %if.then ], [ 0, %if.else10 ], [ 0, %while.cond.return.loopexit_crit_edge ], [ 0, %while.cond.preheader ] + ret i64 %retval.0 +} + +!0 = !{!"branch_weights", i32 1, i32 2000} diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll index a283bcc6d460c..726e30fce63b3 100644 --- a/test/CodeGen/X86/dagcombine-cse.ll +++ b/test/CodeGen/X86/dagcombine-cse.ll @@ -30,7 +30,7 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n ; X64-NEXT: shlq $32, %rcx ; X64-NEXT: movl (%rdi,%rax), %eax ; X64-NEXT: orq %rcx, %rax -; X64-NEXT: movd %rax, %xmm0 +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; X64-NEXT: movd %xmm0, %eax diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll index 612807dd8123e..c2111f672a2e3 100644 --- a/test/CodeGen/X86/dwarf-headers.ll +++ b/test/CodeGen/X86/dwarf-headers.ll @@ -1,16 +1,16 @@ -; RUN: llc -split-dwarf=Disable -dwarf-version=4 -generate-type-units \ +; RUN: llc -dwarf-version=4 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-4 -; RUN: llc -split-dwarf=Enable -dwarf-version=4 -generate-type-units \ +; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=4 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-4 -; RUN: llc -split-dwarf=Disable -dwarf-version=5 -generate-type-units \ +; RUN: llc -dwarf-version=5 -generate-type-units \ ; RUN: 
-filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-5 -; RUN: llc -split-dwarf=Enable -dwarf-version=5 -generate-type-units \ +; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=5 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-5 diff --git a/test/CodeGen/X86/eh-frame-unreachable.ll b/test/CodeGen/X86/eh-frame-unreachable.ll new file mode 100644 index 0000000000000..a7abc8a057fb9 --- /dev/null +++ b/test/CodeGen/X86/eh-frame-unreachable.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; Test that we don't emit a row that extends beyond the FDE's range_size. +; +; CHECK: movq %rsp, %rbp +; CHECK-NEXT: .cfi_endproc +; CHECK-NOT: .cfi + +define void @f() #0 { + unreachable +} +attributes #0 = { "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/empty-function.ll b/test/CodeGen/X86/empty-function.ll new file mode 100644 index 0000000000000..92bebd0ab1a7c --- /dev/null +++ b/test/CodeGen/X86/empty-function.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=LINUX %s + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +; Don't emit empty functions on Windows; it can lead to duplicate entries +; (multiple functions sharing the same RVA) in the Guard CF Function Table which +; the kernel refuses to load. 
+ +define void @f() { +entry: + unreachable + +; CHECK-LABEL: f: +; WIN32: nop +; WIN64: ud2 +; LINUX-NOT: nop +; LINUX-NOT: ud2 + +} diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll index 735df2a4196d5..0c139534e567d 100644 --- a/test/CodeGen/X86/empty-functions.ll +++ b/test/CodeGen/X86/empty-functions.ll @@ -23,8 +23,6 @@ entry: ; CHECK-FP-NEXT: : ; CHECK-FP-NEXT: .cfi_offset %rbp, -16 ; CHECK-FP-NEXT: movq %rsp, %rbp -; CHECK-FP-NEXT: : -; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp ; CHECK-FP-NEXT: .cfi_endproc ; An empty function is perfectly fine on ELF. @@ -35,9 +33,7 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. ; LINUX-FP: func: ; LINUX-FP-NEXT: .cfi_startproc ; LINUX-FP-NEXT: {{^}}# @@ -48,7 +44,5 @@ entry: ; LINUX-FP-NEXT: .cfi_offset %rbp, -16 ; LINUX-FP-NEXT: movq %rsp, %rbp ; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp -; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .size func, .Lfunc_end0-func ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll index e36e33ffe66b7..228ce70b40097 100644 --- a/test/CodeGen/X86/extractelement-index.ll +++ b/test/CodeGen/X86/extractelement-index.ll @@ -320,7 +320,7 @@ define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind { define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v2i64_0: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v2i64_0: @@ -335,7 +335,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v2i64_1: ; SSE2: # BB#0: 
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v2i64_1: @@ -355,7 +355,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_1: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v4i64_1: @@ -376,7 +376,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_3: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v4i64_3: diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index 5c481197c3b49..d68236e9d250e 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -7,7 +7,6 @@ target triple = "i386--netbsd" ; CHECK-LABEL: fn1 ; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: addl {{.*#+}} 4-byte Folded Reload ; CHECK: imull {{.*#+}} 4-byte Folded Reload ; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: retl diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index f7d4eb380d574..c3109673468ec 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -11,7 +11,7 @@ ; LIN: movdqa (%rsi), %xmm0 ; LIN: pand (%rdx), %xmm0 ; LIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; LIN: movd %xmm0, %r[[REG2:.+]] +; LIN: movq %xmm0, %r[[REG2:.+]] ; LIN: movslq %e[[REG2]], %r[[REG1:.+]] ; LIN: sarq $32, %r[[REG2]] ; LIN: movslq %e[[REG4]], %r[[REG3:.+]] @@ -24,7 +24,7 @@ ; WIN: movdqa (%rdx), %xmm0 ; WIN: pand (%r8), %xmm0 ; WIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; WIN: movd %xmm0, %r[[REG2:.+]] +; WIN: movq %xmm0, %r[[REG2:.+]] ; WIN: movslq %e[[REG2]], %r[[REG1:.+]] ; WIN: sarq $32, 
%r[[REG2]] ; WIN: movslq %e[[REG4]], %r[[REG3:.+]] diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll index a745f652d0653..7b2656897e0e8 100644 --- a/test/CodeGen/X86/i256-add.ll +++ b/test/CodeGen/X86/i256-add.ll @@ -12,34 +12,35 @@ define void @add(i256* %p, i256* %q) nounwind { ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %edx -; X32-NEXT: movl (%ecx), %ebx -; X32-NEXT: movl 4(%ecx), %edi +; X32-NEXT: movl 8(%ecx), %edi +; X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ebx ; X32-NEXT: movl 28(%eax), %esi ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill ; X32-NEXT: movl 24(%eax), %ebp -; X32-NEXT: addl (%eax), %ebx -; X32-NEXT: adcl 4(%eax), %edi -; X32-NEXT: adcl 8(%eax), %edx +; X32-NEXT: addl (%eax), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: adcl 4(%eax), %ebx +; X32-NEXT: adcl 8(%eax), %edi +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %edi ; X32-NEXT: movl 12(%eax), %edx -; X32-NEXT: movl 16(%eax), %eax +; X32-NEXT: movl 16(%eax), %esi ; X32-NEXT: adcl 12(%ecx), %edx -; X32-NEXT: adcl 16(%ecx), %eax -; X32-NEXT: adcl 20(%ecx), %esi -; X32-NEXT: adcl 24(%ecx), %ebp -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: adcl 16(%ecx), %esi +; X32-NEXT: adcl 20(%ecx), %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl 24(%ecx), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload ; X32-NEXT: adcl %ebp, 28(%ecx) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 8(%ecx) +; X32-NEXT: movl %ebx, 4(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl %edi, 4(%ecx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%ecx) ; X32-NEXT: movl %edx, 12(%ecx) -; X32-NEXT: movl %eax, 16(%ecx) -; X32-NEXT: movl %esi, 20(%ecx) -; X32-NEXT: 
movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edi, 20(%ecx) ; X32-NEXT: movl %eax, 24(%ecx) ; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %esi @@ -58,9 +59,9 @@ define void @add(i256* %p, i256* %q) nounwind { ; X64-NEXT: adcq 8(%rsi), %rdx ; X64-NEXT: adcq 16(%rsi), %rax ; X64-NEXT: adcq %r8, 24(%rdi) -; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) ; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q @@ -96,9 +97,9 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X32-NEXT: sbbl 24(%esi), %eax ; X32-NEXT: movl 28(%esi), %esi ; X32-NEXT: sbbl %esi, 28(%ecx) -; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl %ebp, 4(%ecx) ; X32-NEXT: movl %edi, 8(%ecx) +; X32-NEXT: movl %ebp, 4(%ecx) +; X32-NEXT: movl %ebx, (%ecx) ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 12(%ecx) ; X32-NEXT: movl (%esp), %esi # 4-byte Reload @@ -122,9 +123,9 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X64-NEXT: sbbq 8(%rsi), %rdx ; X64-NEXT: sbbq 16(%rsi), %rax ; X64-NEXT: sbbq %r8, 24(%rdi) -; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) ; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll index 3da1a360e2904..f2fbff1431213 100644 --- a/test/CodeGen/X86/i64-to-float.ll +++ b/test/CodeGen/X86/i64-to-float.ll @@ -251,11 +251,11 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X64-SSE-NEXT: pandn %xmm3, %xmm0 ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; X64-SSE-NEXT: por %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %rax +; X64-SSE-NEXT: movq %xmm1, %rax ; X64-SSE-NEXT: xorps %xmm0, %xmm0 ; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] -; X64-SSE-NEXT: movd %xmm1, %rax +; X64-SSE-NEXT: movq %xmm1, %rax ; X64-SSE-NEXT: xorps %xmm1, %xmm1 ; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/test/CodeGen/X86/insertelement-duplicates.ll b/test/CodeGen/X86/insertelement-duplicates.ll new file mode 100644 index 0000000000000..b07343362144a --- /dev/null +++ b/test/CodeGen/X86/insertelement-duplicates.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64 + +define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline { +; SSE-32-LABEL: PR15298: +; SSE-32: # BB#0: # %L.entry +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: movaps 304(%ecx), %xmm0 +; SSE-32-NEXT: xorps %xmm1, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-32-NEXT: movups %xmm1, 624(%eax) +; SSE-32-NEXT: movups %xmm0, 608(%eax) +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: PR15298: +; SSE-64: # BB#0: # %L.entry +; SSE-64-NEXT: movaps 304(%rdi), %xmm0 +; SSE-64-NEXT: xorps %xmm1, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-64-NEXT: movups %xmm1, 624(%rsi) +; SSE-64-NEXT: movups %xmm0, 608(%rsi) +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: PR15298: +; AVX-32: # BB#0: # %L.entry +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vbroadcastss 304(%ecx), 
%xmm0 +; AVX-32-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vmovups %ymm0, 608(%eax) +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: PR15298: +; AVX-64: # BB#0: # %L.entry +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 +; AVX-64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +L.entry: + %0 = getelementptr inbounds <4 x float>, <4 x float>* %source, i32 19 + %1 = load <4 x float>, <4 x float>* %0, align 16 + %2 = extractelement <4 x float> %1, i32 0 + %3 = insertelement <8 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %2, i32 2 + %4 = insertelement <8 x float> %3, float %2, i32 1 + %5 = getelementptr <8 x float>, <8 x float>* %dest, i32 19 + store <8 x float> %4, <8 x float>* %5, align 4 + ret void +} diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll index ea38d9e4ec296..89e5f9481188e 100644 --- a/test/CodeGen/X86/isint.ll +++ b/test/CodeGen/X86/isint.ll @@ -1,8 +1,7 @@ -; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck %s -; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK64 %s ; PR19059 -; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK32 %s +; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK32 %s define i32 @isint_return(double %d) nounwind { ; CHECK-LABEL: isint_return: @@ -15,7 +14,8 @@ define i32 @isint_return(double %d) nounwind { %c = fcmp oeq double %d, %e ; CHECK32-NOT: movd {{.*}}, %r{{.*}} ; CHECK32-NOT: andq -; CHECK-NEXT: movd +; CHECK32-NEXT: 
movd +; CHECK64-NEXT: movq ; CHECK-NEXT: andl %z = zext i1 %c to i32 ret i32 %z diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index 62020c2d19146..79f90f49c7c6b 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -44,16 +44,16 @@ define double @test2(double %A, double %B) { define i64 @test3(i64 %A) { ; CHECK-LABEL: test3: ; CHECK: # BB#0: -; CHECK-NEXT: movd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test3: ; CHECK-WIDE: # BB#0: -; CHECK-WIDE-NEXT: movd %rdi, %xmm0 +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 ; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movd %xmm0, %rax +; CHECK-WIDE-NEXT: movq %xmm0, %rax ; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x float> %add = fadd <2 x float> %1, <float 3.0, float 5.0> @@ -67,18 +67,18 @@ define i64 @test3(i64 %A) { define i64 @test4(i64 %A) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: -; CHECK-NEXT: movd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test4: ; CHECK-WIDE: # BB#0: -; CHECK-WIDE-NEXT: movd %rdi, %xmm0 +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 ; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movd %xmm0, %rax +; CHECK-WIDE-NEXT: movq %xmm0, %rax ; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x i32> %add = add <2 x i32> %1, <i32 3, i32 5> diff --git a/test/CodeGen/X86/memcpy-struct-by-value.ll b/test/CodeGen/X86/memcpy-struct-by-value.ll new file mode 100644 index 0000000000000..2e7a64d84000d --- /dev/null +++ b/test/CodeGen/X86/memcpy-struct-by-value.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-ermsb < %s -o 
- | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32 +; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; FIXME: The documentation states that ivybridge has ermsb, but this is not +; enabled right now since I could not confirm by testing. +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST + +%struct.large = type { [4096 x i8] } + +declare void @foo(%struct.large* align 8 byval) nounwind + +define void @test1(%struct.large* nocapture %x) nounwind { + call void @foo(%struct.large* align 8 byval %x) + ret void + +; ALL-LABEL: test1: +; NOFAST: rep;movsq +; NOFAST32: rep;movsl +; FAST: rep;movsb +} + +define void @test2(%struct.large* nocapture %x) nounwind minsize { + call void @foo(%struct.large* align 8 byval %x) + ret void + +; ALL-LABEL: test2: +; NOFAST: rep;movsq +; NOFAST32: rep;movsl +; FAST: rep;movsb +} + +%struct.large_oddsize = type { [4095 x i8] } + +declare void @foo_oddsize(%struct.large_oddsize* align 8 byval) nounwind + +define void @test3(%struct.large_oddsize* nocapture %x) nounwind minsize { + call void @foo_oddsize(%struct.large_oddsize* align 8 byval %x) + ret void + +; ALL-LABEL: test3: +; NOFAST: rep;movsb +; NOFAST32: rep;movsb +; FAST: rep;movsb +} diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll index 
dcb7bd010e56b..f4c4c6d360676 100644 --- a/test/CodeGen/X86/merge_store.ll +++ b/test/CodeGen/X86/merge_store.ll @@ -29,17 +29,8 @@ entry: ret void } - - ;; CHECK-LABEL: indexed-store-merge - -;; We should be able to merge the 4 consecutive stores. -;; FIXMECHECK: movl $0, 2(%rsi,%rdi) - -;; CHECK: movb $0, 2(%rsi,%rdi) -;; CHECK: movb $0, 3(%rsi,%rdi) -;; CHECK: movb $0, 4(%rsi,%rdi) -;; CHECK: movb $0, 5(%rsi,%rdi) +;; CHECK: movl $0, 2(%rsi,%rdi) ;; CHECK: movb $0, (%rsi) define void @indexed-store-merge(i64 %p, i8* %v) { entry: diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll index 9128e5cb4c9de..30cf474dc38b7 100644 --- a/test/CodeGen/X86/mmx-bitcast.ll +++ b/test/CodeGen/X86/mmx-bitcast.ll @@ -80,7 +80,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone { ; CHECK-NEXT: movd %esi, %xmm0 ; CHECK-NEXT: movd %edi, %xmm1 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movd %xmm1, %rax +; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: retq %v0 = insertelement <2 x i32> undef, i32 %a, i32 0 %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1 diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll index 8f2da95353993..fd6c5081b5a35 100644 --- a/test/CodeGen/X86/mmx-cvt.ll +++ b/test/CodeGen/X86/mmx-cvt.ll @@ -347,7 +347,7 @@ define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind { ; X64-NEXT: movq (%rdi), %mm0 ; X64-NEXT: paddd %mm0, %mm0 ; X64-NEXT: movd %mm0, %rax -; X64-NEXT: movd %rax, %xmm0 +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %2 = bitcast <1 x i64>* %0 to x86_mmx* diff --git a/test/CodeGen/X86/mod128.ll b/test/CodeGen/X86/mod128.ll index 4fdee11ec83a1..ae28fab9bb629 100644 --- a/test/CodeGen/X86/mod128.ll +++ b/test/CodeGen/X86/mod128.ll @@ -18,7 +18,7 @@ define i64 @mod128(i128 %x) { ; WIN64-DAG: movq $0, 40(%rsp) ; WIN64-DAG: movq $3, 32(%rsp) ; WIN64: callq __modti3 - ; WIN64: movd %xmm0, %rax + ; WIN64: movq %xmm0, %rax %1 = srem i128 
%x, 3 %2 = trunc i128 %1 to i64 diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll index 1caa22a15947e..e40f64eb39b21 100644 --- a/test/CodeGen/X86/movmsk.ll +++ b/test/CodeGen/X86/movmsk.ll @@ -100,7 +100,7 @@ entry: define void @float_call_signbit(double %n) { ; CHECK-LABEL: float_call_signbit: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movd %xmm0, %rdi +; CHECK-NEXT: movq %xmm0, %rdi ; CHECK-NEXT: shrq $63, %rdi ; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<kill> ; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll index d1bb8d3e923b6..337e625df1683 100644 --- a/test/CodeGen/X86/nontemporal-2.ll +++ b/test/CodeGen/X86/nontemporal-2.ll @@ -596,14 +596,14 @@ define void @test_extract_i64(<2 x i64> %arg, i64* %dst) { ; SSE2-LABEL: test_extract_i64: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movntiq %rax, (%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_extract_i64: ; SSE4A: # BB#0: ; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE4A-NEXT: movd %xmm0, %rax +; SSE4A-NEXT: movq %xmm0, %rax ; SSE4A-NEXT: movntiq %rax, (%rdi) ; SSE4A-NEXT: retq ; diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir new file mode 100644 index 0000000000000..ba5c85922c7ab --- /dev/null +++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir @@ -0,0 +1,322 @@ +# RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=btver2 -run-pass=post-RA-sched -o - %s | FileCheck %s + +# Test that multiple DBG_VALUE's following an instruction whose register needs +# to be changed during the post-RA scheduler pass are updated correctly. 
+ +# Test case was derived from the output from the following command and +# the source code below: +# +# clang -S -emit-llvm -target x86_64 -march=btver2 -O2 -g -o - <srcfile> | +# llc -stop-before=post-RA-sched -o - +# +# Source code reduced from the original 8MB source file: +# +# struct a; +# class b { +# public: +# a *c = ap; +# unsigned *d() { return (unsigned *)c; } +# a *ap; +# }; +# enum { e = 2 }; +# template <typename f> f *g(f *h, f *i) { +# long j = long(i), k = -!h; +# return reinterpret_cast<f *>(long(h) | k & j); +# } +# class l { +# public: +# l(int); +# int m; +# }; +# unsigned *n; +# unsigned o; +# class p { +# public: +# int aa(); +# unsigned *q() { +# n = r.d(); +# return g(n, &o); +# } +# b r; +# }; +# class s : l { +# public: +# p t; +# s(int h) : l(h), ab(t), ac(~0 << h) { ae(); } +# p &ab; +# int ac; +# void ae() { +# const unsigned *v; +# const unsigned u = 0; +# v = ab.q(); +# const unsigned *x = g(v, &u); +# int w = x[m] & ac; +# while (w) { +# int z = (ab.aa() - 1) / e; +# if (m <= z) +# return; +# } +# } +# }; +# class ad { +# public: +# ~ad() { +# for (y();;) +# ; +# } +# class y { +# public: +# y() : af(0) {} +# s af; +# }; +# }; +# class ag { +# ad ah; +# }; +# enum ai {}; +# class aj { +# public: +# aj(unsigned(ai)); +# ag ak; +# }; +# struct al { +# static unsigned am(ai); +# }; +# template <int> struct an : al { static aj ao; }; +# template <> aj an<0>::ao(am); + +--- | + + %class.s = type <{ %class.l, [4 x i8], %class.p, %class.p*, i32, [4 x i8] }> + %class.l = type { i32 } + %class.p = type { %class.b } + %class.b = type { %struct.a*, %struct.a* } + %struct.a = type opaque + + @n = local_unnamed_addr global i32* null, align 8 + @o = global i32 0, align 4 + + define linkonce_odr void @_ZN1sC2Ei(%class.s*, i32) unnamed_addr #0 align 2 !dbg !4 { + %3 = alloca i32, align 4 + %4 = bitcast %class.s* %0 to %class.l* + tail call void @_ZN1lC2Ei(%class.l* %4, i32 %1) + %5 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2 + 
tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !10, metadata !17), !dbg !18 + tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !20, metadata !17), !dbg !27 + %6 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2, i32 0, i32 1 + %7 = bitcast %struct.a** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = bitcast %class.p* %5 to i64* + store i64 %8, i64* %9, align 8 + %10 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 3 + store %class.p* %5, %class.p** %10, align 8 + %11 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 4 + %12 = shl i32 -1, %1 + store i32 %12, i32* %11, align 8 + store i32 0, i32* %3, align 4 + %13 = bitcast %class.p* %5 to i32** + %14 = load i32*, i32** %13, align 8 + store i32* %14, i32** @n, align 8 + %15 = icmp eq i32* %14, null + %16 = ptrtoint i32* %14 to i64 + %17 = select i1 %15, i64 ptrtoint (i32* @o to i64), i64 0 + %18 = or i64 %17, %16 + tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !29, metadata !35), !dbg !36 + tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !39, metadata !17), !dbg !44 + %19 = ptrtoint i32* %3 to i64 + call void @llvm.dbg.value(metadata i64 %19, i64 0, metadata !46, metadata !17), !dbg !48 + %20 = icmp eq i64 %18, 0 + %21 = select i1 %20, i64 %19, i64 0 + %22 = or i64 %21, %18 + %23 = inttoptr i64 %22 to i32* + %24 = bitcast %class.s* %0 to i32* + %25 = load i32, i32* %24, align 8 + %26 = sext i32 %25 to i64 + %27 = getelementptr inbounds i32, i32* %23, i64 %26 + %28 = load i32, i32* %27, align 4 + %29 = and i32 %12, %28 + %30 = icmp eq i32 %29, 0 + br i1 %30, label %47, label %31 + + ; <label>:31: ; preds = %2 + %32 = bitcast %class.s* %0 to i32* + %33 = call i32 @_ZN1p2aaEv(%class.p* %5) + %34 = add nsw i32 %33, -1 + %35 = sdiv i32 %34, 2 + %36 = load i32, i32* %32, align 8 + %37 = icmp sgt i32 %36, %35 + br i1 %37, label %38, label %47 + + ; <label>:38: ; preds = %31 + br label %39 + + ; <label>:39: ; 
preds = %39, %38 + %40 = bitcast %class.s* %0 to i32* + %sunkaddr = ptrtoint %class.s* %0 to i64 + %sunkaddr1 = add i64 %sunkaddr, 24 + %sunkaddr2 = inttoptr i64 %sunkaddr1 to %class.p** + %41 = load %class.p*, %class.p** %sunkaddr2, align 8 + %42 = call i32 @_ZN1p2aaEv(%class.p* %41) + %43 = add nsw i32 %42, -1 + %44 = sdiv i32 %43, 2 + %45 = load i32, i32* %40, align 8 + %46 = icmp sgt i32 %45, %44 + br i1 %46, label %39, label %47 + + ; <label>:47: ; preds = %39, %31, %2 + ret void + } + + declare void @_ZN1lC2Ei(%class.l*, i32) unnamed_addr #1 + + declare i32 @_ZN1p2aaEv(%class.p*) local_unnamed_addr #1 + + ; Function Attrs: nounwind readnone + declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2, !3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !1 = !DIFile(filename: "test.cpp", directory: "") + !2 = !{i32 2, !"Dwarf Version", i32 4} + !3 = !{i32 2, !"Debug Info Version", i32 3} + !4 = distinct !DISubprogram(name: "s", linkageName: "_ZN1sC2Ei", scope: !5, file: !1, line: 32, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) + !5 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "s", file: !1, line: 29, size: 320, identifier: "_ZTS1s") + !6 = !DISubroutineType(types: !7) + !7 = !{null, !8, !9} + !8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !10 = !DILocalVariable(name: "this", arg: 1, scope: !11, type: !16, flags: DIFlagArtificial | DIFlagObjectPointer) + !11 = distinct !DISubprogram(name: "p", linkageName: "_ZN1pC2Ev", scope: !12, file: !1, line: 20, type: !13, isLocal: false, isDefinition: true, scopeLine: 20, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: true, unit: !0) + !12 = 
distinct !DICompositeType(tag: DW_TAG_class_type, name: "p", file: !1, line: 20, size: 128, identifier: "_ZTS1p") + !13 = !DISubroutineType(types: !14) + !14 = !{null, !15} + !15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) + !17 = !DIExpression() + !18 = !DILocation(line: 0, scope: !11, inlinedAt: !19) + !19 = distinct !DILocation(line: 32, column: 3, scope: !4) + !20 = !DILocalVariable(name: "this", arg: 1, scope: !21, type: !26, flags: DIFlagArtificial | DIFlagObjectPointer) + !21 = distinct !DISubprogram(name: "b", linkageName: "_ZN1bC2Ev", scope: !22, file: !1, line: 2, type: !23, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: true, unit: !0) + !22 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "b", file: !1, line: 2, size: 128, identifier: "_ZTS1b") + !23 = !DISubroutineType(types: !24) + !24 = !{null, !25} + !25 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64) + !27 = !DILocation(line: 0, scope: !21, inlinedAt: !28) + !28 = distinct !DILocation(line: 20, column: 7, scope: !11, inlinedAt: !19) + !29 = !DILocalVariable(name: "u", scope: !30, file: !1, line: 37, type: !33) + !30 = distinct !DISubprogram(name: "ae", linkageName: "_ZN1s2aeEv", scope: !5, file: !1, line: 35, type: !31, isLocal: false, isDefinition: true, scopeLine: 35, flags: DIFlagPrototyped, isOptimized: true, unit: !0) + !31 = !DISubroutineType(types: !32) + !32 = !{null, !8} + !33 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !34) + !34 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + !35 = !DIExpression(DW_OP_deref) + !36 = !DILocation(line: 37, column: 20, scope: !30, inlinedAt: !37) + !37 = distinct 
!DILocation(line: 32, column: 41, scope: !38) + !38 = distinct !DILexicalBlock(scope: !4, file: !1, line: 32, column: 39) + !39 = !DILocalVariable(name: "i", arg: 2, scope: !40, file: !1, line: 9, type: !43) + !40 = distinct !DISubprogram(name: "g<const unsigned int>", linkageName: "_Z1gIKjEPT_S2_S2_", scope: !1, file: !1, line: 9, type: !41, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0) + !41 = !DISubroutineType(types: !42) + !42 = !{!43, !43, !43} + !43 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !33, size: 64) + !44 = !DILocation(line: 9, column: 37, scope: !40, inlinedAt: !45) + !45 = distinct !DILocation(line: 39, column: 25, scope: !30, inlinedAt: !37) + !46 = !DILocalVariable(name: "j", scope: !40, file: !1, line: 10, type: !47) + !47 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) + !48 = !DILocation(line: 10, column: 8, scope: !40, inlinedAt: !45) + +... +--- +name: _ZN1sC2Ei +tracksRegLiveness: true +liveins: + - { reg: '%rdi' } + - { reg: '%esi' } +fixedStack: + - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '%rbx' } + - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%r14' } + - { id: 2, type: spill-slot, offset: -16, size: 8, alignment: 16 } +stack: + - { id: 0, offset: -36, size: 4, alignment: 4 } +body: | + bb.0: + successors: %bb.3, %bb.2 + liveins: %esi, %rdi, %r14, %rbx, %rbp + + ; CHECK: [[REGISTER:%r[a-z0-9]+]] = LEA64r {{%r[a-z0-9]+}}, 1, _, -20, _ + ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !46, !17, debug-location !48 + ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !39, !17, debug-location !44 + + frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp + CFI_INSTRUCTION def_cfa_offset 16 + CFI_INSTRUCTION offset %rbp, -16 + %rbp = frame-setup MOV64rr %rsp + CFI_INSTRUCTION def_cfa_register %rbp + frame-setup PUSH64r killed %r14, 
implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp + %rsp = frame-setup SUB64ri8 %rsp, 16, implicit-def dead %eflags + CFI_INSTRUCTION offset %rbx, -32 + CFI_INSTRUCTION offset %r14, -24 + %r14d = MOV32rr %esi + %rbx = MOV64rr %rdi + CALL64pcrel32 @_ZN1lC2Ei, csr_64, implicit %rsp, implicit %rdi, implicit %esi, implicit-def %rsp + %rdi = LEA64r %rbx, 1, _, 8, _ + DBG_VALUE debug-use %rdi, debug-use _, !20, !17, debug-location !27 + DBG_VALUE debug-use %rdi, debug-use _, !10, !17, debug-location !18 + %rax = MOV64rm %rbx, 1, _, 16, _ :: (load 8) + MOV64mr %rbx, 1, _, 8, _, killed %rax :: (store 8) + MOV64mr %rbx, 1, _, 24, _, %rdi :: (store 8) + %eax = MOV32ri -1 + %cl = MOV8rr %r14b, implicit killed %r14d + %eax = SHL32rCL killed %eax, implicit-def dead %eflags, implicit %cl + MOV32mr %rbx, 1, _, 32, _, %eax :: (store 4, align 8) + MOV32mi %rbp, 1, _, -20, _, 0 :: (store 4) + %rcx = MOV64rm %rbx, 1, _, 8, _ :: (load 8) + MOV64mr %rip, 1, _, @n, _, %rcx :: (store 8) + %edx = XOR32rr undef %edx, undef %edx, implicit-def dead %eflags, implicit-def %rdx + TEST64rr %rcx, %rcx, implicit-def %eflags + %esi = MOV32ri @o, implicit-def %rsi + %rsi = CMOVNE64rr killed %rsi, %rdx, implicit killed %eflags + %rsi = OR64rr killed %rsi, killed %rcx, implicit-def %eflags + %rcx = LEA64r %rbp, 1, _, -20, _ + DBG_VALUE debug-use %rcx, debug-use _, !46, !17, debug-location !48 + DBG_VALUE debug-use %rcx, debug-use _, !39, !17, debug-location !44 + DBG_VALUE %rbp, -20, !29, !17, debug-location !36 + %rcx = CMOVNE64rr killed %rcx, killed %rdx, implicit killed %eflags + %rcx = OR64rr killed %rcx, killed %rsi, implicit-def dead %eflags + %rdx = MOVSX64rm32 %rbx, 1, _, 0, _ :: (load 4, align 8) + TEST32rm killed %eax, killed %rcx, 4, killed %rdx, 0, _, implicit-def %eflags :: (load 4) + JNE_1 %bb.2, implicit %eflags + JMP_1 %bb.3 + + bb.1: + successors: %bb.2 + liveins: %rbx, %rbp + + %rdi = MOV64rm %rbx, 1, _, 24, _ :: (load 8) + + bb.2: + 
successors: %bb.1, %bb.3 + liveins: %rbx, %rbp, %rsp, %rdi + + CALL64pcrel32 @_ZN1p2aaEv, csr_64, implicit %rsp, implicit %rdi, implicit-def %rsp, implicit-def %eax + %eax = KILL %eax, implicit-def %rax + %ecx = LEA64_32r %rax, 1, _, -1, _, implicit-def %rcx + %ecx = SHR32ri %ecx, 31, implicit-def dead %eflags, implicit killed %rcx, implicit-def %rcx + %eax = LEA64_32r killed %rax, 1, killed %rcx, -1, _ + %eax = SAR32r1 killed %eax, implicit-def dead %eflags + CMP32mr %rbx, 1, _, 0, _, killed %eax, implicit-def %eflags :: (load 4, align 8), (load 4, align 8) + JG_1 %bb.1, implicit killed %eflags + + bb.3: + liveins: %rbp + + %rsp = ADD64ri8 %rsp, 16, implicit-def dead %eflags + %rbx = POP64r implicit-def %rsp, implicit %rsp + %r14 = POP64r implicit-def %rsp, implicit %rsp + %rbp = POP64r implicit-def %rsp, implicit %rsp + RETQ + +... diff --git a/test/CodeGen/X86/pr14657.ll b/test/CodeGen/X86/pr14657.ll new file mode 100644 index 0000000000000..cc7d3e068d4aa --- /dev/null +++ b/test/CodeGen/X86/pr14657.ll @@ -0,0 +1,325 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +; PR14657 - avoid truncation/extension of comparison results + +@da = common global [1024 x float] zeroinitializer, align 32 +@db = common global [1024 x float] zeroinitializer, align 32 +@dc = common global [1024 x float] zeroinitializer, align 32 +@dd = common global [1024 x float] zeroinitializer, align 32 +@dj = common global [1024 x i32] zeroinitializer, align 32 + +define void @_Z9example25v() nounwind 
uwtable noinline ssp { +; SSE2-LABEL: _Z9example25v: +; SSE2: # BB#0: # %vector.ph +; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB0_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movaps da+4096(%rax), %xmm1 +; SSE2-NEXT: movaps da+4112(%rax), %xmm2 +; SSE2-NEXT: cmpltps db+4112(%rax), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: cmpltps db+4096(%rax), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movaps dc+4096(%rax), %xmm2 +; SSE2-NEXT: movaps dc+4112(%rax), %xmm3 +; SSE2-NEXT: cmpltps dd+4112(%rax), %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: cmpltps dd+4096(%rax), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax) +; SSE2-NEXT: movdqa %xmm1, dj+4096(%rax) +; SSE2-NEXT: addq 
$32, %rax +; SSE2-NEXT: jne .LBB0_1 +; SSE2-NEXT: # BB#2: # %for.end +; SSE2-NEXT: retq +; +; SSE41-LABEL: _Z9example25v: +; SSE41: # BB#0: # %vector.ph +; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: .p2align 4, 0x90 +; SSE41-NEXT: .LBB0_1: # %vector.body +; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE41-NEXT: movaps da+4096(%rax), %xmm2 +; SSE41-NEXT: movaps da+4112(%rax), %xmm3 +; SSE41-NEXT: cmpltps db+4112(%rax), %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: cmpltps db+4096(%rax), %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: psllw $15, %xmm2 +; SSE41-NEXT: psraw $15, %xmm2 +; SSE41-NEXT: movaps dc+4096(%rax), %xmm3 +; SSE41-NEXT: movaps dc+4112(%rax), %xmm4 +; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm4 +; SSE41-NEXT: pshufb %xmm0, %xmm4 +; SSE41-NEXT: cmpltps dd+4096(%rax), %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE41-NEXT: psllw $15, %xmm3 +; SSE41-NEXT: psraw $15, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm3, dj+4112(%rax) +; SSE41-NEXT: movdqa %xmm2, dj+4096(%rax) +; SSE41-NEXT: addq $32, %rax +; SSE41-NEXT: jne .LBB0_1 +; SSE41-NEXT: # BB#2: # %for.end +; SSE41-NEXT: retq +; +; AVX1-LABEL: _Z9example25v: +; AVX1: # BB#0: # %vector.ph +; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB0_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vmovups da+4096(%rax), 
%ymm1 +; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX1-NEXT: vmovups dc+4096(%rax), %ymm2 +; AVX1-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovups %ymm1, dj+4096(%rax) +; AVX1-NEXT: addq $32, %rax +; AVX1-NEXT: jne .LBB0_1 +; AVX1-NEXT: # BB#2: # %for.end +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: _Z9example25v: +; AVX2: # BB#0: # %vector.ph +; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm0 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB0_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovups da+4096(%rax), %ymm1 +; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2 +; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 +; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovups %ymm1, dj+4096(%rax) +; AVX2-NEXT: addq $32, %rax +; AVX2-NEXT: jne .LBB0_1 +; AVX2-NEXT: # BB#2: # %for.end +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %index + %1 = bitcast float* %0 to <8 x float>* + %2 = load <8 x float>, <8 x float>* %1, align 16 + %3 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %index + %4 = bitcast float* %3 to <8 x float>* + %5 = load <8 x float>, <8 x float>* %4, align 16 + %6 = fcmp olt <8 x float> %2, %5 + %7 = getelementptr inbounds [1024 x float], [1024 x float]* @dc, i64 0, i64 %index + %8 = bitcast float* %7 to <8 x float>* + %9 = load <8 x float>, <8 x float>* %8, align 16 + %10 = getelementptr inbounds [1024 x float], [1024 x float]* @dd, i64 0, i64 %index + %11 = bitcast float* %10 to <8 x float>* + %12 = load <8 x 
float>, <8 x float>* %11, align 16 + %13 = fcmp olt <8 x float> %9, %12 + %14 = and <8 x i1> %6, %13 + %15 = zext <8 x i1> %14 to <8 x i32> + %16 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %index + %17 = bitcast i32* %16 to <8 x i32>* + store <8 x i32> %15, <8 x i32>* %17, align 16 + %index.next = add i64 %index, 8 + %18 = icmp eq i64 %index.next, 1024 + br i1 %18, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void +} + +define void @_Z9example24ss(i16 signext %x, i16 signext %y) nounwind uwtable noinline ssp { +; SSE2-LABEL: _Z9example24ss: +; SSE2: # BB#0: # %vector.ph +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB1_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movaps da+4096(%rax), %xmm2 +; SSE2-NEXT: movaps da+4112(%rax), %xmm3 +; SSE2-NEXT: cmpltps db+4112(%rax), %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: cmpltps db+4096(%rax), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad 
$16, %xmm2 +; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax) +; SSE2-NEXT: movdqa %xmm3, dj+4096(%rax) +; SSE2-NEXT: addq $32, %rax +; SSE2-NEXT: jne .LBB1_1 +; SSE2-NEXT: # BB#2: # %for.end +; SSE2-NEXT: retq +; +; SSE41-LABEL: _Z9example24ss: +; SSE41: # BB#0: # %vector.ph +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: .p2align 4, 0x90 +; SSE41-NEXT: .LBB1_1: # %vector.body +; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE41-NEXT: movaps da+4096(%rax), %xmm3 +; SSE41-NEXT: movaps da+4112(%rax), %xmm4 +; SSE41-NEXT: cmpltps db+4112(%rax), %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: cmpltps db+4096(%rax), %xmm3 +; SSE41-NEXT: pshufb %xmm2, %xmm3 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pandn %xmm1, %xmm3 +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 +; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax) +; SSE41-NEXT: movdqa %xmm4, dj+4112(%rax) +; SSE41-NEXT: addq $32, %rax +; SSE41-NEXT: jne .LBB1_1 +; SSE41-NEXT: # BB#2: # %for.end +; SSE41-NEXT: retq +; +; AVX1-LABEL: _Z9example24ss: +; AVX1: # BB#0: # %vector.ph +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vmovd %esi, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX1-NEXT: .p2align 4, 0x90 
+; AVX1-NEXT: .LBB1_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vmovups da+4096(%rax), %ymm2 +; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax) +; AVX1-NEXT: addq $32, %rax +; AVX1-NEXT: jne .LBB1_1 +; AVX1-NEXT: # BB#2: # %for.end +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: _Z9example24ss: +; AVX2: # BB#0: # %vector.ph +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %esi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB1_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovups da+4096(%rax), %ymm2 +; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, dj+4096(%rax) +; AVX2-NEXT: addq $32, %rax +; AVX2-NEXT: jne .LBB1_1 +; AVX2-NEXT: # BB#2: # %for.end +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +vector.ph: + %0 = insertelement <8 x i16> undef, i16 %x, i32 0 + %broadcast11 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast12 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + 
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %2 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %index + %3 = bitcast float* %2 to <8 x float>* + %4 = load <8 x float>, <8 x float>* %3, align 16 + %5 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %index + %6 = bitcast float* %5 to <8 x float>* + %7 = load <8 x float>, <8 x float>* %6, align 16 + %8 = fcmp olt <8 x float> %4, %7 + %9 = select <8 x i1> %8, <8 x i16> %broadcast11, <8 x i16> %broadcast12 + %10 = sext <8 x i16> %9 to <8 x i32> + %11 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %index + %12 = bitcast i32* %11 to <8 x i32>* + store <8 x i32> %10, <8 x i32>* %12, align 16 + %index.next = add i64 %index, 8 + %13 = icmp eq i64 %index.next, 1024 + br i1 %13, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void +} diff --git a/test/CodeGen/X86/pr18344.ll b/test/CodeGen/X86/pr18344.ll index 15bf91031ee88..fcf4174ec3d3b 100644 --- a/test/CodeGen/X86/pr18344.ll +++ b/test/CodeGen/X86/pr18344.ll @@ -36,7 +36,7 @@ define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noa ; X64: # BB#0: # %begin ; X64-NEXT: movdqu (%rdx), %xmm0 ; X64-NEXT: pslld $4, %xmm0 -; X64-NEXT: movd %xmm0, %rax +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: movslq %eax, %r8 ; X64-NEXT: sarq $32, %rax ; X64-NEXT: pextrq $1, %xmm0, %rdx diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll index 54f9cb310dd37..84b7467e6a17f 100644 --- a/test/CodeGen/X86/pr21792.ll +++ b/test/CodeGen/X86/pr21792.ll @@ -16,7 +16,7 @@ define void @func(<4 x float> %vx) { ; CHECK-NEXT: pextrq $1, %xmm0, %rdx ; CHECK-NEXT: movq %rdx, %rcx ; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: movq %rax, %r9 ; CHECK-NEXT: shrq $32, %r9 ; CHECK-NEXT: andl $2032, %eax # imm = 0x7F0 diff --git a/test/CodeGen/X86/pr22970.ll b/test/CodeGen/X86/pr22970.ll new file 
mode 100644 index 0000000000000..38c063355f647 --- /dev/null +++ b/test/CodeGen/X86/pr22970.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +define i32 @PR22970_i32(i32* nocapture readonly, i32) { +; X86-LABEL: PR22970_i32: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $4095, %ecx # imm = 0xFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 32(%eax,%ecx,4), %eax +; X86-NEXT: retl +; +; X64-LABEL: PR22970_i32: +; X64: # BB#0: +; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; X64-NEXT: andl $4095, %esi # imm = 0xFFF +; X64-NEXT: movl 32(%rdi,%rsi,4), %eax +; X64-NEXT: retq + %3 = and i32 %1, 4095 + %4 = add nuw nsw i32 %3, 8 + %5 = zext i32 %4 to i64 + %6 = getelementptr inbounds i32, i32* %0, i64 %5 + %7 = load i32, i32* %6, align 4 + ret i32 %7 +} + +define i32 @PR22970_i64(i32* nocapture readonly, i64) { +; X86-LABEL: PR22970_i64: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $4095, %ecx # imm = 0xFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 32(%eax,%ecx,4), %eax +; X86-NEXT: retl +; +; X64-LABEL: PR22970_i64: +; X64: # BB#0: +; X64-NEXT: andl $4095, %esi # imm = 0xFFF +; X64-NEXT: movl 32(%rdi,%rsi,4), %eax +; X64-NEXT: retq + %3 = and i64 %1, 4095 + %4 = add nuw nsw i64 %3, 8 + %5 = getelementptr inbounds i32, i32* %0, i64 %4 + %6 = load i32, i32* %5, align 4 + ret i32 %6 +} diff --git a/test/CodeGen/X86/pr30511.ll b/test/CodeGen/X86/pr30511.ll index 053ae013b4515..3c512ba270091 100644 --- a/test/CodeGen/X86/pr30511.ll +++ b/test/CodeGen/X86/pr30511.ll @@ -11,7 +11,7 @@ define i64 @PR30511(<2 x double> %a) { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movd 
%xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq %1 = fadd <2 x double> %a, <double 0x4338000000000000, double 0x4338000000000000> %2 = bitcast <2 x double> %1 to <2 x i64> diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll index d447bf9b9b8cb..178fe3357d433 100644 --- a/test/CodeGen/X86/pshufb-mask-comments.ll +++ b/test/CodeGen/X86/pshufb-mask-comments.ll @@ -55,7 +55,7 @@ define <16 x i8> @test5(<16 x i8> %V) { ; CHECK-LABEL: test5: ; CHECK: # BB#0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movd %rax, %xmm1 +; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%rax) ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; CHECK-NEXT: movdqa %xmm1, (%rax) diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll index 758aa462f5137..65c3ac0cc447f 100644 --- a/test/CodeGen/X86/ret-mmx.ll +++ b/test/CodeGen/X86/ret-mmx.ll @@ -33,7 +33,7 @@ define <2 x i32> @t3() nounwind { ; CHECK-LABEL: t3: ; CHECK: ## BB#0: ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movd %rax, %xmm0 +; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: retq ret <2 x i32> <i32 1, i32 0> } diff --git a/test/CodeGen/X86/sad_variations.ll b/test/CodeGen/X86/sad_variations.ll index 1d826cf41a4d0..04fda5ed87740 100644 --- a/test/CodeGen/X86/sad_variations.ll +++ b/test/CodeGen/X86/sad_variations.ll @@ -206,7 +206,7 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad8_64bit_icmp_sext_slt: @@ -255,7 +255,7 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; ; 
AVX2-LABEL: sad8_64bit_icmp_zext_slt: @@ -304,7 +304,7 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt: diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll index 2b19d02ba8b57..c99d3494b8ee3 100644 --- a/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/test/CodeGen/X86/scalar-int-to-fp.ll @@ -536,7 +536,7 @@ define double @u64_to_d(i64 %a) nounwind { ; ; SSE2_64-LABEL: u64_to_d: ; SSE2_64: # BB#0: -; SSE2_64-NEXT: movd %rdi, %xmm1 +; SSE2_64-NEXT: movq %rdi, %xmm1 ; SSE2_64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE2_64-NEXT: subpd {{.*}}(%rip), %xmm1 ; SSE2_64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] diff --git a/test/CodeGen/X86/setcc-combine.ll b/test/CodeGen/X86/setcc-combine.ll index c6ad5e0031edb..38205c660731f 100644 --- a/test/CodeGen/X86/setcc-combine.ll +++ b/test/CodeGen/X86/setcc-combine.ll @@ -1,166 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic < %s | FileCheck %s define i32 @test_eq_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_eq_1: -; CHECK: pcmpgtd %xmm0, %xmm1 -; CHECK-NEXT: pxor {{.*}}(%rip), %xmm1 -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm0, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp eq <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 
%t0 to i32 + ret i32 %t1 } define i32 @test_ne_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_ne_1: -; CHECK: pcmpgtd %xmm0, %xmm1 -; CHECK-NOT: pxor -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp ne <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_le_1: -; CHECK: movl $-1, %eax -; CHECK-NEXT: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sle <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_ge_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_ge_1: -; CHECK: pcmpgtd %xmm0, %xmm1 -; CHECK: pxor {{.*}}(%rip), %xmm1 -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm0, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sge <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_lt_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_lt_1: -; CHECK: pcmpgtd %xmm0, %xmm1 -; CHECK-NOT: pxor -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd 
%xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp slt <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_gt_1: -; CHECK: xorl %eax, %eax -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sgt <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_eq_2(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_eq_2: -; CHECK: pcmpgtd %xmm1, %xmm0 -; CHECK-NEXT: pxor {{.*}}(%rip), %xmm0 -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp eq <4 x i32> %sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_ne_2(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_ne_2: -; CHECK: pcmpgtd %xmm1, %xmm0 -; CHECK-NOT: pxor -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp ne <4 x i32> 
%sext, zeroinitializer - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_le_2(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_le_2: -; CHECK: pcmpgtd %xmm1, %xmm0 -; CHECK: pxor {{.*}}(%rip), %xmm0 -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sle <4 x i32> zeroinitializer, %sext - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_ge_2: -; CHECK: movl $-1, %eax -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sge <4 x i32> zeroinitializer, %sext - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_lt_2(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: test_lt_2: -; CHECK: pcmpgtd %xmm1, %xmm0 -; CHECK-NOT: pxor -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp slt <4 x i32> zeroinitializer, %sext - %0 = extractelement <4 x i1> %cmp, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } define i32 @test_gt_2(<4 x i32> 
%A, <4 x i32> %B) { ; CHECK-LABEL: test_gt_2: -; CHECK: pcmpgtd %xmm1, %xmm0 -; CHECK-NOT: pxor -; CHECK: retq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sgt <4 x i32> zeroinitializer, %sext - %0 = extractelement <4 x i1> %cmp1, i32 1 - %1 = sext i1 %0 to i32 - ret i32 %1 + %t0 = extractelement <4 x i1> %cmp1, i32 1 + %t1 = sext i1 %t0 to i32 + ret i32 %t1 } + diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index b4ec03598aa4e..2996edaec3e0e 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,17 +58,17 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm4, %r9 -; SSE2-NEXT: movd %xmm0, %r10 -; SSE2-NEXT: movd %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %r9 +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rdi +; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm2, %rcx -; SSE2-NEXT: movd %xmm3, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movq %xmm3, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: xorq %r10, %rcx ; SSE2-NEXT: orq %rdx, %rcx @@ -100,17 +100,17 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm4, %r9 -; SSE2-NEXT: movd 
%xmm0, %r10 -; SSE2-NEXT: movd %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %r9 +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rdi +; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm2, %rcx -; SSE2-NEXT: movd %xmm3, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movq %xmm3, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: xorq %r10, %rcx ; SSE2-NEXT: orq %rdx, %rcx diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index 930af226b9535..d5cd8b0525dd5 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -801,7 +801,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000 -; CHECK-NEXT: movd %rcx, %xmm1 +; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmuludq %xmm1, %xmm2 @@ -839,7 +839,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 -; CHECK-NEXT: movd %rcx, %xmm1 +; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmuludq %xmm1, %xmm2 diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll index dfd9c0b0b3029..54de15c292f60 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll +++ 
b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll @@ -16,7 +16,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind { ; X64-LABEL: test_mm_cvtsi128_si64: ; X64: # BB#0: -; X64-NEXT: movd %xmm0, %rax +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: retq %res = extractelement <2 x i64> %a0, i32 0 ret i64 %res @@ -35,7 +35,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind { ; X64-LABEL: test_mm_cvtsi64_si128: ; X64: # BB#0: -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: retq %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0 %res1 = insertelement <2 x i64> %res0, i64 0, i32 1 diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 3071155172e35..964037ea80af8 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2291,8 +2291,8 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind { ; ; X64-LABEL: test_mm_set_epi64x: ; X64: # BB#0: -; X64-NEXT: movd %rdi, %xmm1 -; X64-NEXT: movd %rsi, %xmm0 +; X64-NEXT: movq %rdi, %xmm1 +; X64-NEXT: movq %rsi, %xmm0 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <2 x i64> undef, i64 %a1, i32 0 @@ -2433,7 +2433,7 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind { ; ; X64-LABEL: test_mm_set1_epi64x: ; X64: # BB#0: -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0 @@ -2685,8 +2685,8 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind { ; ; X64-LABEL: test_mm_setr_epi64x: ; X64: # BB#0: -; X64-NEXT: movd %rsi, %xmm1 -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rsi, %xmm1 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0 @@ -3249,7 +3249,7 @@ define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) { ; ; X64-LABEL: test_mm_storel_epi64: ; X64: # BB#0: -; X64-NEXT: movd %xmm0, %rax +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: movq %rax, (%rdi) ; X64-NEXT: retq %ext = extractelement <2 x i64> %a1, i32 0 diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 33a4f413b6832..14c155c8c6c09 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -1808,32 +1808,32 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { ; GENERIC-LABEL: test_movd_64: ; GENERIC: # BB#0: -; GENERIC-NEXT: movd %rdi, %xmm1 +; GENERIC-NEXT: movq %rdi, %xmm1 ; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; GENERIC-NEXT: paddq %xmm0, %xmm1 ; GENERIC-NEXT: paddq %xmm0, %xmm2 -; GENERIC-NEXT: movd %xmm2, %rax +; GENERIC-NEXT: movq %xmm2, %rax ; GENERIC-NEXT: movq %xmm1, (%rsi) ; GENERIC-NEXT: retq ; ; ATOM-LABEL: test_movd_64: ; ATOM: # BB#0: ; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; ATOM-NEXT: movd %rdi, %xmm2 +; ATOM-NEXT: movq %rdi, %xmm2 ; ATOM-NEXT: paddq %xmm0, %xmm2 ; ATOM-NEXT: paddq %xmm0, %xmm1 ; ATOM-NEXT: movq %xmm2, (%rsi) -; ATOM-NEXT: movd %xmm1, %rax +; ATOM-NEXT: movq %xmm1, %rax ; ATOM-NEXT: retq ; ; SLM-LABEL: test_movd_64: ; SLM: # BB#0: ; SLM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [3:1.00] -; SLM-NEXT: movd %rdi, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movq %rdi, %xmm1 # sched: [1:0.50] ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] ; SLM-NEXT: movq %xmm1, (%rsi) # sched: [1:1.00] ; SLM-NEXT: paddq %xmm0, %xmm2 # sched: [1:0.50] -; SLM-NEXT: movd %xmm2, %rax # sched: [1:0.50] +; SLM-NEXT: movq %xmm2, %rax # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_movd_64: @@ -3545,6 +3545,52 @@ define i16 
@test_pextrw(<8 x i16> %a0) { ret i16 %1 } +define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) { +; GENERIC-LABEL: test_pinsrw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pinsrw $1, %edi, %xmm0 +; GENERIC-NEXT: pinsrw $3, (%rsi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pinsrw: +; ATOM: # BB#0: +; ATOM-NEXT: pinsrw $1, %edi, %xmm0 +; ATOM-NEXT: pinsrw $3, (%rsi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pinsrw: +; SLM: # BB#0: +; SLM-NEXT: pinsrw $1, %edi, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pinsrw: +; SANDY: # BB#0: +; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pinsrw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pinsrw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 + %2 = load i16, i16 *%a2 + %3 = insertelement <8 x i16> %1, i16 %2, i32 3 + ret <8 x i16> %3 +} + define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_pmaddwd: ; GENERIC: # BB#0: diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll new file mode 100644 index 0000000000000..482b2fcab6425 --- /dev/null +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -0,0 +1,455 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse3 | FileCheck 
%s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_addsubpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addsubpd %xmm1, %xmm0 +; GENERIC-NEXT: addsubpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addsubpd: +; ATOM: # BB#0: +; ATOM-NEXT: addsubpd %xmm1, %xmm0 +; ATOM-NEXT: addsubpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addsubpd: +; SLM: # BB#0: +; SLM-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addsubpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addsubpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubpd: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone + +define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_addsubps: +; GENERIC: # BB#0: +; GENERIC-NEXT: addsubps %xmm1, %xmm0 +; GENERIC-NEXT: addsubps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addsubps: +; ATOM: # BB#0: +; ATOM-NEXT: addsubps %xmm1, %xmm0 +; ATOM-NEXT: addsubps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addsubps: +; SLM: # BB#0: +; SLM-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addsubps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> 
@llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_haddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: haddpd %xmm1, %xmm0 +; GENERIC-NEXT: haddpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_haddpd: +; ATOM: # BB#0: +; ATOM-NEXT: haddpd %xmm1, %xmm0 +; ATOM-NEXT: haddpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_haddpd: +; SLM: # BB#0: +; SLM-NEXT: haddpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: haddpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_haddpd: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone + +define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_haddps: +; GENERIC: # BB#0: +; 
GENERIC-NEXT: haddps %xmm1, %xmm0 +; GENERIC-NEXT: haddps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_haddps: +; ATOM: # BB#0: +; ATOM-NEXT: haddps %xmm1, %xmm0 +; ATOM-NEXT: haddps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_haddps: +; SLM: # BB#0: +; SLM-NEXT: haddps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: haddps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_haddps: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_hsubpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: hsubpd %xmm1, %xmm0 +; GENERIC-NEXT: hsubpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_hsubpd: +; ATOM: # BB#0: +; ATOM-NEXT: hsubpd %xmm1, %xmm0 +; ATOM-NEXT: hsubpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_hsubpd: +; SLM: # BB#0: +; SLM-NEXT: hsubpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: hsubpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_hsubpd: +; SANDY: # BB#0: 
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone + +define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_hsubps: +; GENERIC: # BB#0: +; GENERIC-NEXT: hsubps %xmm1, %xmm0 +; GENERIC-NEXT: hsubps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_hsubps: +; ATOM: # BB#0: +; ATOM-NEXT: hsubps %xmm1, %xmm0 +; ATOM-NEXT: hsubps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_hsubps: +; SLM: # BB#0: +; SLM-NEXT: hsubps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: hsubps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_hsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubps 
%xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x i8> @test_lddqu(i8* %a0) { +; GENERIC-LABEL: test_lddqu: +; GENERIC: # BB#0: +; GENERIC-NEXT: lddqu (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_lddqu: +; ATOM: # BB#0: +; ATOM-NEXT: lddqu (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lddqu: +; SLM: # BB#0: +; SLM-NEXT: lddqu (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lddqu: +; SANDY: # BB#0: +; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_lddqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lddqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) + ret <16 x i8> %1 +} +declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly + +define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movddup: +; GENERIC: # BB#0: +; GENERIC-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; GENERIC-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movddup: +; ATOM: # BB#0: +; ATOM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0] +; ATOM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: 
test_movddup: +; SLM: # BB#0: +; SLM-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00] +; SLM-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] sched: [3:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movddup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] +; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movddup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] +; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movddup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:1.00] +; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_movshdup: +; GENERIC: # BB#0: +; GENERIC-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; GENERIC-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movshdup: +; ATOM: # BB#0: +; ATOM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] +; ATOM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movshdup: +; SLM: # BB#0: +; SLM-NEXT: movshdup {{.*#+}} xmm1 
= xmm0[1,1,3,3] sched: [1:1.00] +; SLM-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [3:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movshdup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] +; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movshdup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] +; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movshdup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [5:1.00] +; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} + +define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_movsldup: +; GENERIC: # BB#0: +; GENERIC-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; GENERIC-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movsldup: +; ATOM: # BB#0: +; ATOM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2] +; ATOM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movsldup: +; SLM: # BB#0: +; SLM-NEXT: movsldup {{.*#+}} xmm1 = 
xmm0[0,0,2,2] sched: [1:1.00] +; SLM-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [3:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movsldup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] +; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsldup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] +; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsldup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [5:1.00] +; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll new file mode 100644 index 0000000000000..340b9abe88797 --- /dev/null +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -0,0 +1,1938 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule 
-mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_blendpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_blendpd: +; SLM: # BB#0: +; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_blendpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendpd: +; BTVER2: # BB#0: +; 
BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fadd <2 x double> %a1, %1 + %4 = shufflevector <2 x double> %3, <2 x double> %2, <2 x i32> <i32 0, i32 3> + ret <2 x double> %4 +} + +define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_blendps: +; GENERIC: # BB#0: +; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_blendps: +; SLM: # BB#0: +; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00] +; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_blendps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33] +; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = shufflevector <4 x 
float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %3 +} + +define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) { +; GENERIC-LABEL: test_blendvpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movapd %xmm0, %xmm3 +; GENERIC-NEXT: movaps %xmm2, %xmm0 +; GENERIC-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; GENERIC-NEXT: blendvpd %xmm0, (%rdi), %xmm3 +; GENERIC-NEXT: movapd %xmm3, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_blendvpd: +; SLM: # BB#0: +; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00] +; SLM-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00] +; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_blendvpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + %2 = load <2 x double>, <2 x double> *%a3, align 16 + %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone + +define <4 x float> @test_blendvps(<4 x float> %a0, <4 x 
float> %a1, <4 x float> %a2, <4 x float> *%a3) { +; GENERIC-LABEL: test_blendvps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movaps %xmm0, %xmm3 +; GENERIC-NEXT: movaps %xmm2, %xmm0 +; GENERIC-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; GENERIC-NEXT: blendvps %xmm0, (%rdi), %xmm3 +; GENERIC-NEXT: movaps %xmm3, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_blendvps: +; SLM: # BB#0: +; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00] +; SLM-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00] +; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_blendvps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + %2 = load <4 x float>, <4 x float> *%a3 + %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_dppd: +; GENERIC: # BB#0: +; GENERIC-NEXT: dppd $7, %xmm1, %xmm0 +; GENERIC-NEXT: dppd $7, (%rdi), %xmm0 +; GENERIC-NEXT: 
retq +; +; SLM-LABEL: test_dppd: +; SLM: # BB#0: +; SLM-NEXT: dppd $7, %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: dppd $7, (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_dppd: +; SANDY: # BB#0: +; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_dppd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_dppd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_dpps: +; GENERIC: # BB#0: +; GENERIC-NEXT: dpps $7, %xmm1, %xmm0 +; GENERIC-NEXT: dpps $7, (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_dpps: +; SLM: # BB#0: +; SLM-NEXT: dpps $7, %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: dpps $7, (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_dpps: +; SANDY: # BB#0: +; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_dpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00] +; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # 
sched: [18:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_dpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) { +; GENERIC-LABEL: test_insertps: +; GENERIC: # BB#0: +; GENERIC-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] +; GENERIC-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_insertps: +; SLM: # BB#0: +; SLM-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00] +; SLM-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_insertps: +; SANDY: # BB#0: +; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00] +; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_insertps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00] +; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_insertps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] +; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) + %2 = load float, float 
*%a2 + %3 = insertelement <4 x float> %1, float %2, i32 3 + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <2 x i64> @test_movntdqa(i8* %a0) { +; GENERIC-LABEL: test_movntdqa: +; GENERIC: # BB#0: +; GENERIC-NEXT: movntdqa (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_movntdqa: +; SLM: # BB#0: +; SLM-NEXT: movntdqa (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntdqa: +; SANDY: # BB#0: +; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntdqa: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntdqa: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0) + ret <2 x i64> %1 +} +declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone + +define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_mpsadbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: mpsadbw $7, %xmm1, %xmm0 +; GENERIC-NEXT: mpsadbw $7, (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_mpsadbw: +; SLM: # BB#0: +; SLM-NEXT: mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00] +; SLM-NEXT: mpsadbw $7, (%rdi), %xmm0 # sched: [10:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mpsadbw: +; SANDY: # BB#0: +; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00] +; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mpsadbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mpsadbw: 
+; BTVER2: # BB#0: +; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = load <16 x i8>, <16 x i8> *%a2, align 16 + %4 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %2, <16 x i8> %3, i8 7) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_packusdw: +; GENERIC: # BB#0: +; GENERIC-NEXT: packusdw %xmm1, %xmm0 +; GENERIC-NEXT: packusdw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_packusdw: +; SLM: # BB#0: +; SLM-NEXT: packusdw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: packusdw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packusdw: +; SANDY: # BB#0: +; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packusdw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packusdw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %2, <4 x i32> %3) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone + 
+define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> *%a3) { +; GENERIC-LABEL: test_pblendvb: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm3 +; GENERIC-NEXT: movaps %xmm2, %xmm0 +; GENERIC-NEXT: pblendvb %xmm0, %xmm1, %xmm3 +; GENERIC-NEXT: pblendvb %xmm0, (%rdi), %xmm3 +; GENERIC-NEXT: movdqa %xmm3, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pblendvb: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00] +; SLM-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00] +; SLM-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pblendvb: +; SANDY: # BB#0: +; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pblendvb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pblendvb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) + %2 = load <16 x i8>, <16 x i8> *%a3, align 16 + %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pblendw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pblendw: +; SLM: # BB#0: +; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00] +; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pblendw: +; SANDY: # BB#0: +; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pblendw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00] +; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pblendw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15> + ret <8 x i16> %3 +} + +define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pcmpeqq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqq %xmm1, %xmm0 +; GENERIC-NEXT: pcmpeqq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpeqq: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqq %xmm1, 
%xmm0 # sched: [1:0.50] +; SLM-NEXT: pcmpeqq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqq: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <2 x i64> %a0, %a1 + %2 = sext <2 x i1> %1 to <2 x i64> + %3 = load <2 x i64>, <2 x i64>*%a2, align 16 + %4 = icmp eq <2 x i64> %2, %3 + %5 = sext <2 x i1> %4 to <2 x i64> + ret <2 x i64> %5 +} + +define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) { +; GENERIC-LABEL: test_pextrb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrb $3, %xmm0, %eax +; GENERIC-NEXT: pextrb $1, %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pextrb: +; SLM: # BB#0: +; SLM-NEXT: pextrb $3, %xmm0, %eax # sched: [1:1.00] +; SLM-NEXT: pextrb $1, %xmm0, (%rdi) # sched: [4:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrb: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: 
[4:1.00] + %1 = extractelement <16 x i8> %a0, i32 3 + %2 = extractelement <16 x i8> %a0, i32 1 + store i8 %2, i8 *%a1 + %3 = zext i8 %1 to i32 + ret i32 %3 +} + +define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) { +; GENERIC-LABEL: test_pextrd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrd $3, %xmm0, %eax +; GENERIC-NEXT: pextrd $1, %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pextrd: +; SLM: # BB#0: +; SLM-NEXT: pextrd $3, %xmm0, %eax # sched: [1:1.00] +; SLM-NEXT: pextrd $1, %xmm0, (%rdi) # sched: [4:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrd: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = extractelement <4 x i32> %a0, i32 3 + %2 = extractelement <4 x i32> %a0, i32 1 + store i32 %2, i32 *%a1 + ret i32 %1 +} + +define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { +; GENERIC-LABEL: test_pextrq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrq $1, %xmm0, %rax +; GENERIC-NEXT: pextrq $1, %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pextrq: +; SLM: # BB#0: +; SLM-NEXT: pextrq $1, %xmm0, %rax # sched: [1:1.00] +; SLM-NEXT: pextrq $1, %xmm0, (%rdi) # sched: [4:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrq: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] +; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrq: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00] +; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] +; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = extractelement <2 x i64> %a0, i32 1 + %2 = extractelement <2 x i64> %a0, i32 1 + store i64 %2, i64 *%a2 + ret i64 %1 +} + +define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) { +; GENERIC-LABEL: test_pextrw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrw $3, %xmm0, %eax +; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pextrw: +; SLM: # BB#0: +; SLM-NEXT: pextrw $3, %xmm0, %eax # sched: [4:1.00] +; SLM-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [4:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrw: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = extractelement <8 x i16> %a0, i32 3 + %2 = extractelement <8 x i16> %a0, i32 1 + store i16 %2, i16 *%a1 + %3 = zext i16 %1 to i32 + ret i32 %3 +} + +define <8 x i16> @test_phminposuw(<8 x i16> *%a0) { +; GENERIC-LABEL: test_phminposuw: +; GENERIC: # BB#0: +; GENERIC-NEXT: phminposuw (%rdi), %xmm0 +; GENERIC-NEXT: phminposuw %xmm0, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_phminposuw: +; SLM: # BB#0: +; SLM-NEXT: phminposuw (%rdi), %xmm0 # sched: [7:1.00] +; 
SLM-NEXT: phminposuw %xmm0, %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phminposuw: +; SANDY: # BB#0: +; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phminposuw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phminposuw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <8 x i16>, <8 x i16> *%a0, align 16 + %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1) + %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone + +define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) { +; GENERIC-LABEL: test_pinsrb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pinsrb $1, %edi, %xmm0 +; GENERIC-NEXT: pinsrb $3, (%rsi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pinsrb: +; SLM: # BB#0: +; SLM-NEXT: pinsrb $1, %edi, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pinsrb: +; SANDY: # BB#0: +; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pinsrb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pinsrb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: 
vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 + %2 = load i8, i8 *%a2 + %3 = insertelement <16 x i8> %1, i8 %2, i32 3 + ret <16 x i8> %3 +} + +define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pinsrd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pinsrd $1, %edi, %xmm0 +; GENERIC-NEXT: pinsrd $3, (%rsi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pinsrd: +; SLM: # BB#0: +; SLM-NEXT: pinsrd $1, %edi, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pinsrd: +; SANDY: # BB#0: +; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pinsrd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pinsrd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 + %2 = load i32, i32 *%a2 + %3 = insertelement <4 x i32> %1, i32 %2, i32 3 + ret <4 x i32> %3 +} + +define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { +; GENERIC-LABEL: test_pinsrq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pinsrq $1, %rdi, %xmm0 +; GENERIC-NEXT: pinsrq $1, (%rsi), %xmm1 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pinsrq: +; SLM: # BB#0: +; SLM-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_pinsrq: +; SANDY: # BB#0: +; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pinsrq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pinsrq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1 + %2 = load i64, i64 *%a3 + %3 = insertelement <2 x i64> %a1, i64 %2, i32 1 + %4 = add <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pmaxsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxsb %xmm1, %xmm0 +; GENERIC-NEXT: pmaxsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmaxsb: +; SLM: # BB#0: +; SLM-NEXT: pmaxsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: 
vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone + +define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmaxsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxsd %xmm1, %xmm0 +; GENERIC-NEXT: pmaxsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmaxsd: +; SLM: # BB#0: +; SLM-NEXT: pmaxsd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxsd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxsd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmaxud: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxud %xmm1, %xmm0 +; GENERIC-NEXT: pmaxud (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: 
test_pmaxud: +; SLM: # BB#0: +; SLM-NEXT: pmaxud %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxud (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxud: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxud: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxud: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmaxuw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxuw %xmm1, %xmm0 +; GENERIC-NEXT: pmaxuw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmaxuw: +; SLM: # BB#0: +; SLM-NEXT: pmaxuw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxuw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxuw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxuw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxuw: +; 
BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pminsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminsb %xmm1, %xmm0 +; GENERIC-NEXT: pminsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pminsb: +; SLM: # BB#0: +; SLM-NEXT: pminsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone + +define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pminsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminsd 
%xmm1, %xmm0 +; GENERIC-NEXT: pminsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pminsd: +; SLM: # BB#0: +; SLM-NEXT: pminsd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminsd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminsd: +; SANDY: # BB#0: +; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pminud: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminud %xmm1, %xmm0 +; GENERIC-NEXT: pminud (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pminud: +; SLM: # BB#0: +; SLM-NEXT: pminud %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminud (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminud: +; SANDY: # BB#0: +; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminud: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # 
sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminud: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pminuw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminuw %xmm1, %xmm0 +; GENERIC-NEXT: pminuw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pminuw: +; SLM: # BB#0: +; SLM-NEXT: pminuw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminuw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminuw: +; SANDY: # BB#0: +; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminuw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminuw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) 
{ +; GENERIC-LABEL: test_pmovsxbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxbw %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxbw (%rdi), %xmm0 +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxbw: +; SLM: # BB#0: +; SLM-NEXT: pmovsxbw (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxbw %xmm0, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = load <8 x i8>, <8 x i8>* %a1, align 1 + %4 = sext <8 x i8> %3 to <8 x i16> + %5 = add <8 x i16> %2, %4 + ret <8 x i16> %5 +} + +define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) { +; GENERIC-LABEL: test_pmovsxbd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxbd %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxbd (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxbd: +; SLM: # BB#0: +; SLM-NEXT: pmovsxbd (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxbd %xmm0, %xmm0 # sched: [1:1.00] +; 
SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxbd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxbd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxbd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = load <4 x i8>, <4 x i8>* %a1, align 1 + %4 = sext <4 x i8> %3 to <4 x i32> + %5 = add <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) { +; GENERIC-LABEL: test_pmovsxbq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxbq %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxbq (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxbq: +; SLM: # BB#0: +; SLM-NEXT: pmovsxbq (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxbq %xmm0, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxbq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # 
sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxbq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxbq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %2 = sext <2 x i8> %1 to <2 x i64> + %3 = load <2 x i8>, <2 x i8>* %a1, align 1 + %4 = sext <2 x i8> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) { +; GENERIC-LABEL: test_pmovsxdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxdq %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxdq (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxdq: +; SLM: # BB#0: +; SLM-NEXT: pmovsxdq (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxdq %xmm0, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: 
[6:1.00] +; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %2 = sext <2 x i32> %1 to <2 x i64> + %3 = load <2 x i32>, <2 x i32>* %a1, align 1 + %4 = sext <2 x i32> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) { +; GENERIC-LABEL: test_pmovsxwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxwd %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxwd (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxwd: +; SLM: # BB#0: +; SLM-NEXT: pmovsxwd (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxwd %xmm0, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = load <4 x i16>, <4 x i16>* %a1, align 1 + %4 = sext <4 x i16> %3 to <4 x i32> + %5 = add 
<4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) { +; GENERIC-LABEL: test_pmovsxwq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovsxwq %xmm0, %xmm1 +; GENERIC-NEXT: pmovsxwq (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovsxwq: +; SLM: # BB#0: +; SLM-NEXT: pmovsxwq (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: pmovsxwq %xmm0, %xmm0 # sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovsxwq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovsxwq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovsxwq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %2 = sext <2 x i16> %1 to <2 x i64> + %3 = load <2 x i16>, <2 x i16>* %a1, align 1 + %4 = sext <2 x i16> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) { +; GENERIC-LABEL: test_pmovzxbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxbw: +; SLM: # BB#0: +; SLM-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [4:1.00] +; SLM-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] +; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 
# sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = load <8 x i8>, <8 x i8>* %a1, align 1 + %4 = zext <8 x i8> %3 to <8 x i16> + %5 = add <8 x i16> %2, %4 + ret <8 x i16> %5 +} + +define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) { +; GENERIC-LABEL: test_pmovzxbd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxbd: +; SLM: # BB#0: +; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [4:1.00] +; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxbd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] +; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxbd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxbd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = load <4 x i8>, <4 x i8>* %a1, align 1 + %4 = zext <4 x i8> %3 to <4 x i32> + %5 = add <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) { +; GENERIC-LABEL: test_pmovzxbq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxbq: +; SLM: # BB#0: +; SLM-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [4:1.00] +; SLM-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxbq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: 
[1:0.50] +; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxbq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxbq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %2 = zext <2 x i8> %1 to <2 x i64> + %3 = load <2 x i8>, <2 x i8>* %a1, align 1 + %4 = zext <2 x i8> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) { +; GENERIC-LABEL: test_pmovzxdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxdq: +; SLM: # BB#0: +; SLM-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [4:1.00] +; SLM-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: 
[1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] +; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %2 = zext <2 x i32> %1 to <2 x i64> + %3 = load <2 x i32>, <2 x i32>* %a1, align 1 + %4 = zext <2 x i32> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) { +; GENERIC-LABEL: test_pmovzxwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxwd: +; SLM: # BB#0: +; SLM-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [4:1.00] +; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # 
sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50] +; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = load <4 x i16>, <4 x i16>* %a1, align 1 + %4 = zext <4 x i16> %3 to <4 x i32> + %5 = add <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) { +; GENERIC-LABEL: test_pmovzxwq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmovzxwq: +; SLM: # BB#0: +; SLM-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [4:1.00] +; SLM-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00] +; 
SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovzxwq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] +; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovzxwq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00] +; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovzxwq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00] +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %2 = zext <2 x i16> %1 to <2 x i64> + %3 = load <2 x i16>, <2 x i16>* %a1, align 1 + %4 = zext <2 x i16> %3 to <2 x i64> + %5 = add <2 x i64> %2, %4 + ret <2 x i64> %5 +} + +define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmuldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmuldq %xmm1, %xmm0 +; GENERIC-NEXT: pmuldq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmuldq: +; SLM: # BB#0: +; SLM-NEXT: pmuldq %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmuldq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmuldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; 
SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmuldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmuldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %2, <4 x i32> %3) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmulld: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulld %xmm1, %xmm0 +; GENERIC-NEXT: pmulld (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pmulld: +; SLM: # BB#0: +; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmulld: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulld: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulld: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = mul <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x 
i32> *%a2, align 16 + %3 = mul <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_ptest: +; GENERIC: # BB#0: +; GENERIC-NEXT: ptest %xmm1, %xmm0 +; GENERIC-NEXT: setb %al +; GENERIC-NEXT: ptest (%rdi), %xmm0 +; GENERIC-NEXT: setb %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: movzbl %cl, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ptest: +; SLM: # BB#0: +; SLM-NEXT: ptest %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: setb %al # sched: [1:0.50] +; SLM-NEXT: ptest (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: setb %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: movzbl %cl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ptest: +; SANDY: # BB#0: +; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50] +; SANDY-NEXT: setb %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_ptest: +; HASWELL: # BB#0: +; HASWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: setb %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ptest: +; BTVER2: # BB#0: +; BTVER2-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vptest (%rdi), %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: setb %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) + %2 = load <2 x 
i64>, <2 x i64> *%a2, align 16 + %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) + %4 = and i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone + +define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_roundpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: roundpd $7, %xmm0, %xmm1 +; GENERIC-NEXT: roundpd $7, (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_roundpd: +; SLM: # BB#0: +; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00] +; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_roundpd: +; SANDY: # BB#0: +; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <4 x float> @test_roundps(<4 x float> 
%a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_roundps: +; GENERIC: # BB#0: +; GENERIC-NEXT: roundps $7, %xmm0, %xmm1 +; GENERIC-NEXT: roundps $7, (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_roundps: +; SLM: # BB#0: +; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00] +; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_roundps: +; SANDY: # BB#0: +; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_roundsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movaps %xmm0, %xmm2 +; GENERIC-NEXT: roundsd $7, %xmm1, %xmm2 +; GENERIC-NEXT: roundsd $7, (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm2, %xmm0 +; GENERIC-NEXT: retq 
+; +; SLM-LABEL: test_roundsd: +; SLM: # BB#0: +; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00] +; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_roundsd: +; SANDY: # BB#0: +; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00] +; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) + %2 = load <2 x double>, <2 x double>* %a2, align 16 + %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} +declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone + +define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_roundss: +; GENERIC: # BB#0: +; GENERIC-NEXT: movaps %xmm0, %xmm2 +; GENERIC-NEXT: roundss $7, %xmm1, %xmm2 +; GENERIC-NEXT: roundss $7, (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_roundss: +; SLM: # BB#0: +; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00] +; SLM-NEXT: roundss 
$7, (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00] +; SLM-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_roundss: +; SANDY: # BB#0: +; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00] +; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll new file mode 100644 index 0000000000000..afc48bc57ee7d --- /dev/null +++ b/test/CodeGen/X86/sse42-schedule.ll @@ -0,0 +1,477 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < 
%s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { +; GENERIC-LABEL: crc32_32_8: +; GENERIC: # BB#0: +; GENERIC-NEXT: crc32b %sil, %edi +; GENERIC-NEXT: crc32b (%rdx), %edi +; GENERIC-NEXT: movl %edi, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: crc32_32_8: +; SLM: # BB#0: +; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00] +; SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: crc32_32_8: +; SANDY: # BB#0: +; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] +; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: crc32_32_8: +; HASWELL: # BB#0: +; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] +; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: crc32_32_8: +; BTVER2: # BB#0: +; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] +; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: 
[4:1.00] + %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) + %2 = load i8, i8 *%a2 + %3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) + ret i32 %3 +} +declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind + +define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) { +; GENERIC-LABEL: crc32_32_16: +; GENERIC: # BB#0: +; GENERIC-NEXT: crc32w %si, %edi +; GENERIC-NEXT: crc32w (%rdx), %edi +; GENERIC-NEXT: movl %edi, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: crc32_32_16: +; SLM: # BB#0: +; SLM-NEXT: crc32w %si, %edi # sched: [3:1.00] +; SLM-NEXT: crc32w (%rdx), %edi # sched: [6:1.00] +; SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: crc32_32_16: +; SANDY: # BB#0: +; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00] +; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00] +; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: crc32_32_16: +; HASWELL: # BB#0: +; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00] +; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00] +; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: crc32_32_16: +; BTVER2: # BB#0: +; BTVER2-NEXT: crc32w %si, %edi # sched: [3:1.00] +; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00] +; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) + %2 = load i16, i16 *%a2 + %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) + ret i32 %3 +} +declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind + +define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: crc32_32_32: +; GENERIC: # BB#0: +; GENERIC-NEXT: crc32l %esi, %edi +; GENERIC-NEXT: crc32l (%rdx), %edi +; GENERIC-NEXT: movl %edi, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: crc32_32_32: +; SLM: # BB#0: +; SLM-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; SLM-NEXT: crc32l 
(%rdx), %edi # sched: [6:1.00] +; SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: crc32_32_32: +; SANDY: # BB#0: +; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00] +; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: crc32_32_32: +; HASWELL: # BB#0: +; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00] +; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: crc32_32_32: +; BTVER2: # BB#0: +; BTVER2-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00] +; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) + %2 = load i32, i32 *%a2 + %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) + ret i32 %3 +} +declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind + +define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind { +; GENERIC-LABEL: crc32_64_8: +; GENERIC: # BB#0: +; GENERIC-NEXT: crc32b %sil, %edi +; GENERIC-NEXT: crc32b (%rdx), %edi +; GENERIC-NEXT: movq %rdi, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: crc32_64_8: +; SLM: # BB#0: +; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00] +; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: crc32_64_8: +; SANDY: # BB#0: +; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] +; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: crc32_64_8: +; HASWELL: # BB#0: +; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] +; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] 
+; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: crc32_64_8: +; BTVER2: # BB#0: +; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) + %2 = load i8, i8 *%a2 + %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) + ret i64 %3 +} +declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind + +define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: crc32_64_64: +; GENERIC: # BB#0: +; GENERIC-NEXT: crc32q %rsi, %rdi +; GENERIC-NEXT: crc32q (%rdx), %rdi +; GENERIC-NEXT: movq %rdi, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: crc32_64_64: +; SLM: # BB#0: +; SLM-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; SLM-NEXT: crc32q (%rdx), %rdi # sched: [6:1.00] +; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: crc32_64_64: +; SANDY: # BB#0: +; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00] +; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: crc32_64_64: +; HASWELL: # BB#0: +; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00] +; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: crc32_64_64: +; BTVER2: # BB#0: +; BTVER2-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) + %2 = load i64, i64 *%a2 + %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) + ret i64 %3 +} +declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind + +define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x 
i8> *%a2) { +; GENERIC-LABEL: test_pcmpestri: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl $7, %eax +; GENERIC-NEXT: movl $7, %edx +; GENERIC-NEXT: pcmpestri $7, %xmm1, %xmm0 +; GENERIC-NEXT: movl %ecx, %esi +; GENERIC-NEXT: movl $7, %eax +; GENERIC-NEXT: movl $7, %edx +; GENERIC-NEXT: pcmpestri $7, (%rdi), %xmm0 +; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; GENERIC-NEXT: leal (%rcx,%rsi), %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpestri: +; SLM: # BB#0: +; SLM-NEXT: movl $7, %eax # sched: [1:0.50] +; SLM-NEXT: movl $7, %edx # sched: [1:0.50] +; SLM-NEXT: pcmpestri $7, %xmm1, %xmm0 # sched: [21:21.00] +; SLM-NEXT: movl $7, %eax # sched: [1:0.50] +; SLM-NEXT: movl $7, %edx # sched: [1:0.50] +; SLM-NEXT: movl %ecx, %esi # sched: [1:0.50] +; SLM-NEXT: pcmpestri $7, (%rdi), %xmm0 # sched: [21:21.00] +; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SLM-NEXT: leal (%rcx,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpestri: +; SANDY: # BB#0: +; SANDY-NEXT: movl $7, %eax # sched: [1:0.33] +; SANDY-NEXT: movl $7, %edx # sched: [1:0.33] +; SANDY-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67] +; SANDY-NEXT: movl %ecx, %esi # sched: [1:0.33] +; SANDY-NEXT: movl $7, %eax # sched: [1:0.33] +; SANDY-NEXT: movl $7, %edx # sched: [1:0.33] +; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33] +; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpestri: +; HASWELL: # BB#0: +; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] +; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] +; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25] +; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] +; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] +; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: # kill: %ECX<def> 
%ECX<kill> %RCX<def> +; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpestri: +; BTVER2: # BB#0: +; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17] +; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] +; BTVER2-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [13:2.50] +; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17] +; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] +; BTVER2-NEXT: movl %ecx, %esi # sched: [1:0.17] +; BTVER2-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:2.50] +; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone + +define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpestrm: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl $7, %eax +; GENERIC-NEXT: movl $7, %edx +; GENERIC-NEXT: pcmpestrm $7, %xmm1, %xmm0 +; GENERIC-NEXT: movl $7, %eax +; GENERIC-NEXT: movl $7, %edx +; GENERIC-NEXT: pcmpestrm $7, (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpestrm: +; SLM: # BB#0: +; SLM-NEXT: movl $7, %eax # sched: [1:0.50] +; SLM-NEXT: movl $7, %edx # sched: [1:0.50] +; SLM-NEXT: pcmpestrm $7, %xmm1, %xmm0 # sched: [17:17.00] +; SLM-NEXT: movl $7, %eax # sched: [1:0.50] +; SLM-NEXT: movl $7, %edx # sched: [1:0.50] +; SLM-NEXT: pcmpestrm $7, (%rdi), %xmm0 # sched: [17:17.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpestrm: +; SANDY: # BB#0: +; SANDY-NEXT: movl $7, %eax # sched: [1:0.33] +; SANDY-NEXT: movl $7, %edx # sched: [1:0.33] +; SANDY-NEXT: vpcmpestrm 
$7, %xmm1, %xmm0 # sched: [11:2.67] +; SANDY-NEXT: movl $7, %eax # sched: [1:0.33] +; SANDY-NEXT: movl $7, %edx # sched: [1:0.33] +; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpestrm: +; HASWELL: # BB#0: +; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] +; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] +; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00] +; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] +; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] +; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpestrm: +; BTVER2: # BB#0: +; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17] +; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] +; BTVER2-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [13:2.50] +; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17] +; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] +; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone + +define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpistri: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpistri $7, %xmm1, %xmm0 +; GENERIC-NEXT: movl %ecx, %eax +; GENERIC-NEXT: pcmpistri $7, (%rdi), %xmm0 +; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; GENERIC-NEXT: leal (%rcx,%rax), %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpistri: +; SLM: # BB#0: +; SLM-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [17:17.00] +; SLM-NEXT: movl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: pcmpistri $7, 
(%rdi), %xmm0 # sched: [17:17.00] +; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SLM-NEXT: leal (%rcx,%rax), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpistri: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00] +; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpistri: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: movl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpistri: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: movl %ecx, %eax # sched: [1:0.17] +; BTVER2-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:1.00] +; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpistrm: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpistrm $7, %xmm1, %xmm0 +; GENERIC-NEXT: pcmpistrm $7, (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpistrm: +; SLM: # BB#0: +; SLM-NEXT: pcmpistrm $7, %xmm1, %xmm0 # 
sched: [13:13.00] +; SLM-NEXT: pcmpistrm $7, (%rdi), %xmm0 # sched: [13:13.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpistrm: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpistrm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00] +; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpistrm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pcmpgtq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpgtq %xmm1, %xmm0 +; GENERIC-NEXT: pcmpgtq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_pcmpgtq: +; SLM: # BB#0: +; SLM-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pcmpgtq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtq: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_pcmpgtq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <2 x i64> %a0, %a1 + %2 = sext <2 x i1> %1 to <2 x i64> + %3 = load <2 x i64>, <2 x i64>*%a2, align 16 + %4 = icmp sgt <2 x i64> %2, %3 + %5 = sext <2 x i1> %4 to <2 x i64> + ret <2 x i64> %5 +} diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll new file mode 100644 index 0000000000000..8b7a0c0ec02b6 --- /dev/null +++ b/test/CodeGen/X86/ssse3-schedule.ll @@ -0,0 +1,754 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { +; 
GENERIC-LABEL: test_pabsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pabsb %xmm0, %xmm1 +; GENERIC-NEXT: pabsb (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pabsb: +; ATOM: # BB#0: +; ATOM-NEXT: pabsb (%rdi), %xmm1 +; ATOM-NEXT: pabsb %xmm0, %xmm0 +; ATOM-NEXT: por %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pabsb: +; SLM: # BB#0: +; SLM-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pabsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pabsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pabsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pabsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpabsb (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) + %2 = load <16 x i8>, <16 x i8> *%a1, align 16 + %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2) + %4 = or <16 x i8> %1, %3 + ret <16 x i8> %4 +} +declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone + +define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_pabsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pabsd %xmm0, %xmm1 +; GENERIC-NEXT: pabsd (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pabsd: +; ATOM: # BB#0: +; ATOM-NEXT: pabsd (%rdi), 
%xmm1 +; ATOM-NEXT: pabsd %xmm0, %xmm0 +; ATOM-NEXT: por %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pabsd: +; SLM: # BB#0: +; SLM-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pabsd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pabsd: +; SANDY: # BB#0: +; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pabsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pabsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpabsd (%rdi), %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) + %2 = load <4 x i32>, <4 x i32> *%a1, align 16 + %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2) + %4 = or <4 x i32> %1, %3 + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone + +define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) { +; GENERIC-LABEL: test_pabsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pabsw %xmm0, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pabsw: +; ATOM: # BB#0: +; ATOM-NEXT: pabsw %xmm0, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pabsw: +; SLM: # BB#0: +; SLM-NEXT: pabsw %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pabsw: +; SANDY: # BB#0: +; 
SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pabsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pabsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) + %2 = load <8 x i16>, <8 x i16> *%a1, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2) + %4 = or <8 x i16> %1, %3 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone + +define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_palignr: +; GENERIC: # BB#0: +; GENERIC-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; GENERIC-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; GENERIC-NEXT: movdqa %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_palignr: +; ATOM: # BB#0: +; ATOM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; ATOM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_palignr: +; SLM: # BB#0: +; SLM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00] +; SLM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [4:1.00] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_palignr: +; SANDY: # BB#0: +; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50] +; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50] +; SANDY-NEXT: retq # sched: 
[5:1.00] +; +; HASWELL-LABEL: test_palignr: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00] +; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_palignr: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50] +; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> + ret <8 x i16> %3 +} + +define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_phaddd: +; GENERIC: # BB#0: +; GENERIC-NEXT: phaddd %xmm1, %xmm0 +; GENERIC-NEXT: phaddd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phaddd: +; ATOM: # BB#0: +; ATOM-NEXT: phaddd %xmm1, %xmm0 +; ATOM-NEXT: phaddd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_phaddd: +; SLM: # BB#0: +; SLM-NEXT: phaddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phaddd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phaddd: +; SANDY: # BB#0: +; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phaddd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phaddd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphaddd %xmm1, 
%xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_phaddsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: phaddsw %xmm1, %xmm0 +; GENERIC-NEXT: phaddsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phaddsw: +; ATOM: # BB#0: +; ATOM-NEXT: phaddsw %xmm1, %xmm0 +; ATOM-NEXT: phaddsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_phaddsw: +; SLM: # BB#0: +; SLM-NEXT: phaddsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phaddsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phaddsw: +; SANDY: # BB#0: +; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phaddsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phaddsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> 
%1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_phaddw: +; GENERIC: # BB#0: +; GENERIC-NEXT: phaddw %xmm1, %xmm0 +; GENERIC-NEXT: phaddw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phaddw: +; ATOM: # BB#0: +; ATOM-NEXT: phaddw %xmm1, %xmm0 +; ATOM-NEXT: phaddw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_phaddw: +; SLM: # BB#0: +; SLM-NEXT: phaddw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phaddw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phaddw: +; SANDY: # BB#0: +; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phaddw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phaddw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_phsubd: +; GENERIC: # BB#0: +; GENERIC-NEXT: phsubd %xmm1, %xmm0 +; GENERIC-NEXT: phsubd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phsubd: +; ATOM: # BB#0: +; ATOM-NEXT: phsubd %xmm1, %xmm0 +; ATOM-NEXT: phsubd (%rdi), %xmm0 +; 
ATOM-NEXT: retq +; +; SLM-LABEL: test_phsubd: +; SLM: # BB#0: +; SLM-NEXT: phsubd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phsubd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phsubd: +; SANDY: # BB#0: +; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phsubd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phsubd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_phsubsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: phsubsw %xmm1, %xmm0 +; GENERIC-NEXT: phsubsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phsubsw: +; ATOM: # BB#0: +; ATOM-NEXT: phsubsw %xmm1, %xmm0 +; ATOM-NEXT: phsubsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_phsubsw: +; SLM: # BB#0: +; SLM-NEXT: phsubsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phsubsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phsubsw: +; SANDY: # BB#0: +; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: 
vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phsubsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phsubsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_phsubw: +; GENERIC: # BB#0: +; GENERIC-NEXT: phsubw %xmm1, %xmm0 +; GENERIC-NEXT: phsubw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_phsubw: +; ATOM: # BB#0: +; ATOM-NEXT: phsubw %xmm1, %xmm0 +; ATOM-NEXT: phsubw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_phsubw: +; SLM: # BB#0: +; SLM-NEXT: phsubw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: phsubw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_phsubw: +; SANDY: # BB#0: +; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_phsubw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_phsubw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: 
[6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pmaddubsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaddubsw %xmm1, %xmm0 +; GENERIC-NEXT: pmaddubsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaddubsw: +; ATOM: # BB#0: +; ATOM-NEXT: pmaddubsw %xmm1, %xmm0 +; ATOM-NEXT: pmaddubsw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaddubsw: +; SLM: # BB#0: +; SLM-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaddubsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaddubsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaddubsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = bitcast <8 x i16> %1 to <16 x i8> + %4 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %3, <16 x i8> %2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 
x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmulhrsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulhrsw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmulhrsw: +; ATOM: # BB#0: +; ATOM-NEXT: pmulhrsw %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmulhrsw: +; SLM: # BB#0: +; SLM-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmulhrsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulhrsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulhrsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pshufb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshufb %xmm1, %xmm0 +; GENERIC-NEXT: pshufb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshufb: +; ATOM: # BB#0: +; ATOM-NEXT: pshufb %xmm1, %xmm0 +; ATOM-NEXT: pshufb (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshufb: +; SLM: # BB#0: +; SLM-NEXT: pshufb %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pshufb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshufb: +; SANDY: # BB#0: +; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_pshufb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshufb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psignb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psignb %xmm1, %xmm0 +; GENERIC-NEXT: psignb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psignb: +; ATOM: # BB#0: +; ATOM-NEXT: psignb %xmm1, %xmm0 +; ATOM-NEXT: psignb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psignb: +; SLM: # BB#0: +; SLM-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psignb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psignb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psignb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psignb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; 
BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone + +define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psignd: +; GENERIC: # BB#0: +; GENERIC-NEXT: psignd %xmm1, %xmm0 +; GENERIC-NEXT: psignd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psignd: +; ATOM: # BB#0: +; ATOM-NEXT: psignd %xmm1, %xmm0 +; ATOM-NEXT: psignd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psignd: +; SLM: # BB#0: +; SLM-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psignd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psignd: +; SANDY: # BB#0: +; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psignd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psignd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 +} +declare <4 x 
i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psignw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psignw %xmm1, %xmm0 +; GENERIC-NEXT: psignw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psignw: +; ATOM: # BB#0: +; ATOM-NEXT: psignw %xmm1, %xmm0 +; ATOM-NEXT: psignw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psignw: +; SLM: # BB#0: +; SLM-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psignw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psignw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psignw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psignw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll index cc384e19394f2..15fb25777eddc 100644 --- a/test/CodeGen/X86/statepoint-vector.ll +++ b/test/CodeGen/X86/statepoint-vector.ll @@ -22,7 +22,7 @@ define <2 x i8 addrspace(1)*> 
@test2(<2 x i8 addrspace(1)*> %obj, i64 %offset) g entry: ; CHECK-LABEL: @test2 ; CHECK: subq $40, %rsp -; CHECK: movd %rdi, %xmm1 +; CHECK: movq %rdi, %xmm1 ; CHECK: pshufd $68, %xmm1, %xmm1 # xmm1 = xmm1[0,1,0,1] ; CHECK: paddq %xmm0, %xmm1 ; CHECK: movdqa %xmm0, 16(%rsp) diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll index 805bc25c17b62..ac0b43b2402f8 100644 --- a/test/CodeGen/X86/tls-pic.ll +++ b/test/CodeGen/X86/tls-pic.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32 %s -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64 @i = thread_local global i32 15 @j = internal thread_local global i32 42 @@ -11,9 +11,9 @@ entry: ret i32 %tmp1 } -; X32-LABEL: f1: -; X32: leal i@TLSGD(,%ebx), %eax -; X32: calll ___tls_get_addr@PLT +; X86-LABEL: f1: +; X86: leal i@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT ; X64-LABEL: f1: ; X64: leaq i@TLSGD(%rip), %rdi @@ -27,9 +27,9 @@ entry: ret i32* @i } -; X32-LABEL: f2: -; X32: leal i@TLSGD(,%ebx), %eax -; X32: calll ___tls_get_addr@PLT +; X86-LABEL: f2: +; X86: leal i@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT ; X64-LABEL: f2: ; X64: leaq i@TLSGD(%rip), %rdi @@ -43,9 +43,9 @@ entry: ret i32 %tmp1 } -; X32-LABEL: f3: -; X32: leal i@TLSGD(,%ebx), %eax -; X32: calll ___tls_get_addr@PLT +; X86-LABEL: f3: +; X86: leal i@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT ; X64-LABEL: f3: ; X64: leaq i@TLSGD(%rip), %rdi @@ -57,9 +57,9 @@ entry: ret i32* @i } -; X32-LABEL: f4: -; X32: leal i@TLSGD(,%ebx), %eax -; X32: calll ___tls_get_addr@PLT +; X86-LABEL: f4: +; X86: leal i@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT ; X64-LABEL: f4: ; 
X64: leaq i@TLSGD(%rip), %rdi @@ -74,11 +74,11 @@ entry: ret i32 %add } -; X32-LABEL: f5: -; X32: leal {{[jk]}}@TLSLDM(%ebx) -; X32: calll ___tls_get_addr@PLT -; X32: movl {{[jk]}}@DTPOFF(%e -; X32: addl {{[jk]}}@DTPOFF(%e +; X86-LABEL: f5: +; X86: leal {{[jk]}}@TLSLDM(%ebx) +; X86: calll ___tls_get_addr@PLT +; X86: movl {{[jk]}}@DTPOFF(%e +; X86: addl {{[jk]}}@DTPOFF(%e ; X64-LABEL: f5: ; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll index 842a3bab66473..7a7e40362bcf9 100644 --- a/test/CodeGen/X86/tls-pie.ll +++ b/test/CodeGen/X86/tls-pie.ll @@ -1,81 +1,112 @@ -; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=X32 %s -; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=X64 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnux32 -relocation-model=pic | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64 @i = thread_local global i32 15 @i2 = external thread_local global i32 define i32 @f1() { +; X86-LABEL: f1: +; X86: # BB#0: # %entry +; X86-NEXT: movl %gs:i@NTPOFF, %eax +; X86-NEXT: retl +; ; X32-LABEL: f1: -; X32: movl %gs:i@NTPOFF, %eax -; X32-NEXT: ret +; X32: # BB#0: # %entry +; X32-NEXT: movl %fs:i@TPOFF, %eax +; X32-NEXT: retq +; ; X64-LABEL: f1: -; X64: movl %fs:i@TPOFF, %eax -; X64-NEXT: ret - +; X64: # BB#0: # %entry +; X64-NEXT: movl %fs:i@TPOFF, %eax +; X64-NEXT: retq entry: %tmp1 = load i32, i32* @i ret i32 %tmp1 } define i32* @f2() { +; X86-LABEL: f2: +; X86: # BB#0: # %entry +; X86-NEXT: movl %gs:0, %eax +; X86-NEXT: leal 
i@NTPOFF(%eax), %eax +; X86-NEXT: retl +; ; X32-LABEL: f2: -; X32: movl %gs:0, %eax -; X32-NEXT: leal i@NTPOFF(%eax), %eax -; X32-NEXT: ret +; X32: # BB#0: # %entry +; X32-NEXT: movl %fs:0, %eax +; X32-NEXT: leal i@TPOFF(%rax), %eax +; X32-NEXT: retq +; ; X64-LABEL: f2: -; X64: movq %fs:0, %rax -; X64-NEXT: leaq i@TPOFF(%rax), %rax -; X64-NEXT: ret - +; X64: # BB#0: # %entry +; X64-NEXT: movq %fs:0, %rax +; X64-NEXT: leaq i@TPOFF(%rax), %rax +; X64-NEXT: retq entry: ret i32* @i } define i32 @f3() { +; X86-LABEL: f3: +; X86: # BB#0: # %entry +; X86-NEXT: calll .L2$pb +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: .L2$pb: +; X86-NEXT: popl %eax +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_adjust_cfa_offset -4 +; X86-NEXT: .Ltmp0: +; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L2$pb), %eax +; X86-NEXT: movl i2@GOTNTPOFF(%eax), %eax +; X86-NEXT: movl %gs:(%eax), %eax +; X86-NEXT: retl +; ; X32-LABEL: f3: -; X32: calll .L{{[0-9]+}}$pb -; X32-NEXT: .Lcfi{{[0-9]+}}: -; X32-NEXT: .cfi_adjust_cfa_offset 4 -; X32-NEXT: .L{{[0-9]+}}$pb: -; X32-NEXT: popl %eax -; X32-NEXT: .Lcfi{{[0-9]+}}: -; X32-NEXT: .cfi_adjust_cfa_offset -4 -; X32-NEXT: .Ltmp{{[0-9]+}}: -; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax -; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax -; X32-NEXT: movl %gs:(%eax), %eax -; X32-NEXT: ret +; X32: # BB#0: # %entry +; X32-NEXT: movl i2@{{.*}}(%rip), %eax +; X32-NEXT: movl %fs:(%eax), %eax +; X32-NEXT: retq +; ; X64-LABEL: f3: -; X64: movq i2@GOTTPOFF(%rip), %rax -; X64-NEXT: movl %fs:(%rax), %eax -; X64-NEXT: ret - +; X64: # BB#0: # %entry +; X64-NEXT: movq i2@{{.*}}(%rip), %rax +; X64-NEXT: movl %fs:(%rax), %eax +; X64-NEXT: retq entry: %tmp1 = load i32, i32* @i2 ret i32 %tmp1 } define i32* @f4() { +; X86-LABEL: f4: +; X86: # BB#0: # %entry +; X86-NEXT: calll .L3$pb +; X86-NEXT: .Lcfi2: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: .L3$pb: +; X86-NEXT: popl %ecx +; X86-NEXT: .Lcfi3: +; X86-NEXT: 
.cfi_adjust_cfa_offset -4 +; X86-NEXT: .Ltmp1: +; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L3$pb), %ecx +; X86-NEXT: movl %gs:0, %eax +; X86-NEXT: addl i2@GOTNTPOFF(%ecx), %eax +; X86-NEXT: retl +; ; X32-LABEL: f4: -; X32: calll .L{{[0-9]+}}$pb -; X32-NEXT: .Lcfi{{[0-9]+}}: -; X32-NEXT: .cfi_adjust_cfa_offset 4 -; X32-NEXT: .L{{[0-9]+}}$pb: -; X32-NEXT: popl %ecx -; X32-NEXT: .Lcfi{{[0-9]+}}: -; X32-NEXT: .cfi_adjust_cfa_offset -4 -; X32-NEXT: .Ltmp{{[0-9]+}}: -; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx -; X32-NEXT: movl %gs:0, %eax -; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax -; X32-NEXT: ret +; X32: # BB#0: # %entry +; X32-NEXT: movl %fs:0, %eax +; X32-NEXT: addl i2@{{.*}}(%rip), %eax +; X32-NEXT: retq +; ; X64-LABEL: f4: -; X64: movq %fs:0, %rax -; X64-NEXT: addq i2@GOTTPOFF(%rip), %rax -; X64-NEXT: ret - +; X64: # BB#0: # %entry +; X64-NEXT: movq %fs:0, %rax +; X64-NEXT: addq i2@{{.*}}(%rip), %rax +; X64-NEXT: retq entry: ret i32* @i2 } diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll index 85c51e618b2a7..d39716aab7643 100644 --- a/test/CodeGen/X86/tls.ll +++ b/test/CodeGen/X86/tls.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32_LINUX %s +; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s -; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X32_WIN %s +; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X86_WIN %s ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s ; RUN: llc < %s -march=x86 -mtriple=x86-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s @@ -16,18 +16,18 @@ @b2 = thread_local(localexec) global i8 0 define i32 @f1() { -; 
X32_LINUX-LABEL: f1: -; X32_LINUX: movl %gs:i1@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f1: +; X86_LINUX: movl %gs:i1@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f1: ; X64_LINUX: movl %fs:i1@TPOFF, %eax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f1: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movl _i1@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f1: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movl _i1@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f1: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -47,20 +47,20 @@ entry: } define i32* @f2() { -; X32_LINUX-LABEL: f2: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal i1@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f2: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal i1@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f2: ; X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: leaq i1@TPOFF(%rax), %rax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f2: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: leal _i1@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f2: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: leal _i1@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f2: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -79,20 +79,20 @@ entry: } define i32 @f3() nounwind { -; X32_LINUX-LABEL: f3: -; X32_LINUX: movl i2@INDNTPOFF, %eax -; X32_LINUX-NEXT: movl %gs:(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f3: +; X86_LINUX: movl i2@INDNTPOFF, %eax +; X86_LINUX-NEXT: movl %gs:(%eax), %eax +; 
X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f3: ; X64_LINUX: movq i2@GOTTPOFF(%rip), %rax ; X64_LINUX-NEXT: movl %fs:(%rax), %eax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f3: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movl _i2@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f3: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movl _i2@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f3: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -112,20 +112,20 @@ entry: } define i32* @f4() { -; X32_LINUX-LABEL: f4: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: addl i2@INDNTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f4: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: addl i2@INDNTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f4: ; X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: addq i2@GOTTPOFF(%rip), %rax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f4: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: leal _i2@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f4: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: leal _i2@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f4: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -144,18 +144,18 @@ entry: } define i32 @f5() nounwind { -; X32_LINUX-LABEL: f5: -; X32_LINUX: movl %gs:i3@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f5: +; X86_LINUX: movl %gs:i3@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f5: ; X64_LINUX: movl %fs:i3@TPOFF, %eax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f5: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl 
%fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movl _i3@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f5: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movl _i3@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f5: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -175,20 +175,20 @@ entry: } define i32* @f6() { -; X32_LINUX-LABEL: f6: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal i3@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f6: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal i3@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f6: ; X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: leaq i3@TPOFF(%rax), %rax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f6: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: leal _i3@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f6: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: leal _i3@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f6: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -207,9 +207,9 @@ entry: } define i32 @f7() { -; X32_LINUX-LABEL: f7: -; X32_LINUX: movl %gs:i4@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f7: +; X86_LINUX: movl %gs:i4@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f7: ; X64_LINUX: movl %fs:i4@TPOFF, %eax ; X64_LINUX-NEXT: ret @@ -226,10 +226,10 @@ entry: } define i32* @f8() { -; X32_LINUX-LABEL: f8: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal i4@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f8: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal i4@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f8: ; 
X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: leaq i4@TPOFF(%rax), %rax @@ -246,9 +246,9 @@ entry: } define i32 @f9() { -; X32_LINUX-LABEL: f9: -; X32_LINUX: movl %gs:i5@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f9: +; X86_LINUX: movl %gs:i5@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f9: ; X64_LINUX: movl %fs:i5@TPOFF, %eax ; X64_LINUX-NEXT: ret @@ -265,10 +265,10 @@ entry: } define i32* @f10() { -; X32_LINUX-LABEL: f10: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal i5@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f10: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal i5@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f10: ; X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: leaq i5@TPOFF(%rax), %rax @@ -285,18 +285,18 @@ entry: } define i16 @f11() { -; X32_LINUX-LABEL: f11: -; X32_LINUX: movzwl %gs:s1@NTPOFF, %eax -; X32_LINUX: ret +; X86_LINUX-LABEL: f11: +; X86_LINUX: movzwl %gs:s1@NTPOFF, %eax +; X86_LINUX: ret ; X64_LINUX-LABEL: f11: ; X64_LINUX: movzwl %fs:s1@TPOFF, %eax ; X64_LINUX: ret -; X32_WIN-LABEL: f11: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax -; X32_WIN: ret +; X86_WIN-LABEL: f11: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax +; X86_WIN: ret ; X64_WIN-LABEL: f11: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -316,18 +316,18 @@ entry: } define i32 @f12() { -; X32_LINUX-LABEL: f12: -; X32_LINUX: movswl %gs:s1@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f12: +; X86_LINUX: movswl %gs:s1@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f12: ; X64_LINUX: movswl %fs:s1@TPOFF, %eax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f12: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, 
%ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movswl _s1@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f12: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movswl _s1@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f12: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -349,18 +349,18 @@ entry: } define i8 @f13() { -; X32_LINUX-LABEL: f13: -; X32_LINUX: movb %gs:b1@NTPOFF, %al -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f13: +; X86_LINUX: movb %gs:b1@NTPOFF, %al +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f13: ; X64_LINUX: movb %fs:b1@TPOFF, %al ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f13: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movb _b1@SECREL32(%eax), %al -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f13: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movb _b1@SECREL32(%eax), %al +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f13: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -380,18 +380,18 @@ entry: } define i32 @f14() { -; X32_LINUX-LABEL: f14: -; X32_LINUX: movsbl %gs:b1@NTPOFF, %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f14: +; X86_LINUX: movsbl %gs:b1@NTPOFF, %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f14: ; X64_LINUX: movsbl %fs:b1@TPOFF, %eax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f14: -; X32_WIN: movl __tls_index, %eax -; X32_WIN-NEXT: movl %fs:__tls_array, %ecx -; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax -; X32_WIN-NEXT: movsbl _b1@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f14: +; X86_WIN: movl __tls_index, %eax +; X86_WIN-NEXT: movl %fs:__tls_array, %ecx +; X86_WIN-NEXT: movl (%ecx,%eax,4), %eax +; X86_WIN-NEXT: movsbl _b1@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; 
X64_WIN-LABEL: f14: ; X64_WIN: movl _tls_index(%rip), %eax ; X64_WIN-NEXT: movq %gs:88, %rcx @@ -412,19 +412,19 @@ entry: } define i8* @f15() { -; X32_LINUX-LABEL: f15: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal b2@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f15: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal b2@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f15: ; X64_LINUX: movq %fs:0, %rax ; X64_LINUX-NEXT: leaq b2@TPOFF(%rax), %rax ; X64_LINUX-NEXT: ret -; X32_WIN-LABEL: f15: -; X32_WIN: movl %fs:__tls_array, %eax -; X32_WIN-NEXT: movl (%eax), %eax -; X32_WIN-NEXT: leal _b2@SECREL32(%eax), %eax -; X32_WIN-NEXT: ret +; X86_WIN-LABEL: f15: +; X86_WIN: movl %fs:__tls_array, %eax +; X86_WIN-NEXT: movl (%eax), %eax +; X86_WIN-NEXT: leal _b2@SECREL32(%eax), %eax +; X86_WIN-NEXT: ret ; X64_WIN-LABEL: f15: ; X64_WIN: movq %gs:88, %rax ; X64_WIN-NEXT: movq (%rax), %rax @@ -441,10 +441,10 @@ entry: define i32* @f16() { -; X32_LINUX-LABEL: f16: -; X32_LINUX: movl %gs:0, %eax -; X32_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax -; X32_LINUX-NEXT: ret +; X86_LINUX-LABEL: f16: +; X86_LINUX: movl %gs:0, %eax +; X86_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax +; X86_LINUX-NEXT: ret ; X64_LINUX-LABEL: f16: ; X64_LINUX: movq %fs:0, %rax diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll index 78799ff04fe1c..9804f0ef983ba 100644 --- a/test/CodeGen/X86/vec_fneg.ll +++ b/test/CodeGen/X86/vec_fneg.ll @@ -10,7 +10,7 @@ define <4 x float> @t1(<4 x float> %Q) nounwind { ; X32-SSE-LABEL: t1: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: xorps .LCPI0_0, %xmm0 +; X32-SSE-NEXT: xorps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl ; ; X64-SSE-LABEL: t1: @@ -92,7 +92,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind { ; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000 ; X64-SSE2-NEXT: xorq %rdi, %rax -; X64-SSE2-NEXT: movd %rax, %xmm0 +; X64-SSE2-NEXT: movq %rax, %xmm0 ; X64-SSE2-NEXT: retq 
%bitcast = bitcast i64 %i to <2 x float> %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index a345f78e18c13..477150016486b 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -20,10 +20,10 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i64: ; SSE: # BB#0: ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -128,16 +128,16 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; SSE-LABEL: fptosi_4f64_to_4i64: ; SSE: # BB#0: ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm3 +; SSE-NEXT: movq %rax, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -263,7 +263,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subsd %xmm2, %xmm3 @@ -272,7 +272,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; 
SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -347,7 +347,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subsd %xmm2, %xmm3 @@ -356,7 +356,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] @@ -428,7 +428,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subsd %xmm1, %xmm3 @@ -437,7 +437,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: ucomisd %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: retq @@ -507,7 +507,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subsd %xmm2, %xmm3 @@ -516,13 
+516,13 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm0 +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovbq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, %xmm0 @@ -586,7 +586,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm2, %rdx ; SSE-NEXT: ucomisd %xmm3, %xmm2 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm0 +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: subsd %xmm3, %xmm4 @@ -595,7 +595,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm2, %rdx ; SSE-NEXT: ucomisd %xmm3, %xmm2 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movapd %xmm1, %xmm2 ; SSE-NEXT: subsd %xmm3, %xmm2 @@ -604,7 +604,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm1, %rdx ; SSE-NEXT: ucomisd %xmm3, %xmm1 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subsd %xmm3, %xmm4 @@ -613,7 +613,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm1, %rax ; SSE-NEXT: ucomisd %xmm3, %xmm1 ; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq @@ -761,7 +761,7 
@@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm1, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm1 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subsd %xmm2, %xmm4 @@ -770,7 +770,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm1, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm1 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: subsd %xmm2, %xmm1 @@ -779,7 +779,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rdx ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: subsd %xmm2, %xmm4 @@ -788,7 +788,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE-NEXT: movaps %xmm1, %xmm0 @@ -879,10 +879,10 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i64: ; SSE: # BB#0: ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -940,10 +940,10 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptosi_4f32_to_2i64: ; SSE: # BB#0: ; 
SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1016,19 +1016,19 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; SSE-LABEL: fptosi_4f32_to_4i64: ; SSE: # BB#0: ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm3 +; SSE-NEXT: movq %rax, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1124,19 +1124,19 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; SSE-LABEL: fptosi_8f32_to_4i64: ; SSE: # BB#0: ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm3 +; SSE-NEXT: movq %rax, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: 
punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1245,7 +1245,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rdx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subss %xmm2, %xmm3 @@ -1254,7 +1254,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1390,7 +1390,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rdx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subss %xmm2, %xmm3 @@ -1399,7 +1399,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1477,7 +1477,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rdx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movd %rdx, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: subss %xmm2, %xmm3 @@ -1486,7 +1486,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: cmovaeq %rax, %rcx -; 
SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1685,7 +1685,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -1695,7 +1695,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm3, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm3 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] @@ -1706,7 +1706,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm3, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm3 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: subss %xmm1, %xmm4 @@ -1715,7 +1715,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1863,7 +1863,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm2 +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -1873,7 +1873,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; 
SSE-NEXT: cvttss2si %xmm3, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm3 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] @@ -1884,7 +1884,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm3, %rdx ; SSE-NEXT: ucomiss %xmm1, %xmm3 ; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movd %rdx, %xmm3 +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: subss %xmm1, %xmm4 @@ -1893,7 +1893,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: ucomiss %xmm1, %xmm0 ; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -2257,9 +2257,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; SSE-NEXT: movzwl %ax, %edi ; SSE-NEXT: callq __gnu_h2f_ieee ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] @@ -2407,12 +2407,12 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-NEXT: movq %rdx, %rdi ; SSE-NEXT: movq %rcx, %rsi ; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movq %rbx, %rdi ; SSE-NEXT: movq %r14, %rsi ; SSE-NEXT: callq __fixtfdi -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = 
xmm0[0],mem[0] ; SSE-NEXT: xorps %xmm1, %xmm1 diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll index 2d55ffbd6e7a6..ff8b1f14c52de 100644 --- a/test/CodeGen/X86/vec_insert-3.ll +++ b/test/CodeGen/X86/vec_insert-3.ll @@ -15,7 +15,7 @@ define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind { ; ; X64-LABEL: t1: ; X64: # BB#0: -; X64-NEXT: movd %rdi, %xmm1 +; X64-NEXT: movq %rdi, %xmm1 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1 diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index a37c377e890e5..e7c06a99df9cc 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -19,7 +19,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind { ; X64: # BB#0: ; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: shll $12, %edi -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rsi) diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll index 143957e29ed67..fffafe7697dad 100644 --- a/test/CodeGen/X86/vec_insert-mmx.ll +++ b/test/CodeGen/X86/vec_insert-mmx.ll @@ -17,7 +17,7 @@ define x86_mmx @t0(i32 %A) nounwind { ; X64-LABEL: t0: ; X64: ## BB#0: ; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 649b45712f578..a42b3c96c3ae6 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -19,10 +19,10 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { ; 
SSE-LABEL: sitofp_2i64_to_2f64: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -217,17 +217,17 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE-LABEL: sitofp_4i64_to_4f64: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: cvtsi2sdq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2sdq %rax, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] @@ -1047,10 +1047,10 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE-LABEL: sitofp_2i64_to_4f32: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1111,10 +1111,10 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { ; SSE-LABEL: sitofp_2i64_to_4f32_zero: ; SSE: # BB#0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; 
SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1170,11 +1170,11 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-LABEL: sitofp_4i64_to_4f32_undef: ; SSE: # BB#0: ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1367,17 +1367,17 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-LABEL: sitofp_4i64_to_4f32: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1610,7 +1610,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE-LABEL: uitofp_2i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB39_1 ; SSE-NEXT: # BB#2: @@ -1627,7 +1627,7 @@ define <4 x float> 
@uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB39_3: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB39_4 ; SSE-NEXT: # BB#5: @@ -1729,7 +1729,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; SSE-LABEL: uitofp_2i64_to_2f32: ; SSE: # BB#0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB40_1 ; SSE-NEXT: # BB#2: @@ -1745,7 +1745,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB40_3: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB40_4 ; SSE-NEXT: # BB#5: @@ -1845,7 +1845,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: .LBB41_2: -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB41_3 ; SSE-NEXT: # BB#4: @@ -1863,7 +1863,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-NEXT: .LBB41_5: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB41_6 ; SSE-NEXT: # BB#7: @@ -2145,7 +2145,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-LABEL: uitofp_4i64_to_4f32: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_1 ; SSE-NEXT: # BB#2: @@ -2159,7 +2159,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 
; SSE-NEXT: .LBB47_3: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_4 ; SSE-NEXT: # BB#5: @@ -2174,7 +2174,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB47_6: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_7 ; SSE-NEXT: # BB#8: @@ -2192,7 +2192,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE-NEXT: .LBB47_9: ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_10 ; SSE-NEXT: # BB#11: @@ -2591,10 +2591,10 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; SSE-LABEL: sitofp_load_2i64_to_2f64: ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2733,18 +2733,18 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movd 
%xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -3382,17 +3382,17 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3549,34 +3549,34 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movd %xmm4, %rax +; SSE-NEXT: movq %xmm4, %rax ; 
SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -3824,7 +3824,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_1 ; SSE-NEXT: # BB#2: @@ -3838,7 +3838,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB76_3: -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_4 ; SSE-NEXT: # BB#5: @@ -3853,7 +3853,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB76_6: ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_7 ; SSE-NEXT: # BB#8: @@ -3871,7 +3871,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-NEXT: .LBB76_9: ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_10 ; SSE-NEXT: # BB#11: @@ -4190,7 +4190,7 @@ define <8 x float> 
@uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movd %xmm5, %rax +; SSE-NEXT: movq %xmm5, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_1 ; SSE-NEXT: # BB#2: @@ -4204,7 +4204,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: addss %xmm4, %xmm4 ; SSE-NEXT: .LBB80_3: -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_4 ; SSE-NEXT: # BB#5: @@ -4219,7 +4219,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB80_6: ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; SSE-NEXT: movd %xmm5, %rax +; SSE-NEXT: movq %xmm5, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_7 ; SSE-NEXT: # BB#8: @@ -4234,7 +4234,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: addss %xmm6, %xmm6 ; SSE-NEXT: .LBB80_9: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_10 ; SSE-NEXT: # BB#11: @@ -4250,7 +4250,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 ; SSE-NEXT: addss %xmm5, %xmm5 ; SSE-NEXT: .LBB80_12: -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_13 ; SSE-NEXT: # BB#14: @@ -4264,7 +4264,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: addss %xmm7, %xmm7 ; SSE-NEXT: .LBB80_15: -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_16 ; SSE-NEXT: # BB#17: @@ -4283,7 +4283,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; 
SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movd %xmm3, %rax +; SSE-NEXT: movq %xmm3, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_19 ; SSE-NEXT: # BB#20: @@ -4302,7 +4302,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_22 ; SSE-NEXT: # BB#23: diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll index 560e5c568faf0..7a4326c01bb7d 100644 --- a/test/CodeGen/X86/vec_set-8.ll +++ b/test/CodeGen/X86/vec_set-8.ll @@ -4,7 +4,7 @@ define <2 x i64> @test(i64 %i) nounwind { ; CHECK-LABEL: test: ; CHECK: # BB#0: -; CHECK-NEXT: movd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: retq %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll index b08f96038ff14..994bc2b3056ed 100644 --- a/test/CodeGen/X86/vec_set-C.ll +++ b/test/CodeGen/X86/vec_set-C.ll @@ -10,7 +10,7 @@ define <2 x i64> @t1(i64 %x) nounwind { ; ; X64-LABEL: t1: ; X64: # BB#0: -; X64-NEXT: movd %rdi, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: retq %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0 ret <2 x i64> %tmp8 diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll index 64c64c3925441..c13299b9cb385 100644 --- a/test/CodeGen/X86/vec_shift7.ll +++ b/test/CodeGen/X86/vec_shift7.ll @@ -17,7 +17,7 @@ define i64 @test1(<2 x i64> %a) { ; ; X64-LABEL: test1: ; X64: # BB#0: # %entry -; X64-NEXT: movd %xmm0, %rax +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: retq entry: %c = shl <2 x i64> %a, <i64 0, i64 2> diff --git 
a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll index 316df2780d16e..202b8f7786b80 100644 --- a/test/CodeGen/X86/vector-compare-all_of.ll +++ b/test/CodeGen/X86/vector-compare-all_of.ll @@ -10,7 +10,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) { ; SSE-NEXT: cmpltpd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_sext: @@ -46,7 +46,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) { ; SSE-NEXT: andpd %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_sext: @@ -285,7 +285,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64_sext: @@ -321,7 +321,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64_sext: diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll index 1d3db6495708f..043ba28e8fa40 100644 --- a/test/CodeGen/X86/vector-compare-any_of.ll +++ b/test/CodeGen/X86/vector-compare-any_of.ll @@ -10,7 +10,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) { ; SSE-NEXT: cmpltpd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_sext: @@ -46,7 +46,7 @@ define 
i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) { ; SSE-NEXT: orpd %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_sext: @@ -267,7 +267,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64_sext: @@ -303,7 +303,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v4i64_sext: diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 895bf5c0f02d1..2b5eb695f53ea 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -11,22 +11,22 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_div7_2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $63, %rax ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: movd %rdx, %xmm1 +; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $63, %rax ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: movd %rdx, %xmm0 +; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -40,14 +40,14 @@ 
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: sarq %rdx ; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: movd %rdx, %xmm1 -; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: imulq %rcx ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: sarq %rdx ; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: movd %rdx, %xmm0 +; SSE41-NEXT: movq %rdx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -275,7 +275,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rsi @@ -286,9 +286,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: leaq (,%rdx,8), %rax ; SSE2-NEXT: subq %rdx, %rax ; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: movd %rcx, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rsi ; SSE2-NEXT: movq %rdx, %rax @@ -298,7 +298,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: leaq (,%rdx,8), %rax ; SSE2-NEXT: subq %rdx, %rax ; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: movd %rcx, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -316,8 +316,8 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: leaq (,%rdx,8), %rax ; SSE41-NEXT: subq %rdx, %rax ; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: movd %rcx, %xmm1 -; SSE41-NEXT: movd %xmm0, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rcx ; 
SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: imulq %rsi ; SSE41-NEXT: movq %rdx, %rax @@ -327,7 +327,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: leaq (,%rdx,8), %rax ; SSE41-NEXT: subq %rdx, %rax ; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: movd %rcx, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index 1b35e2fdddae0..cd17fcf8c85b4 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -11,7 +11,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_div7_2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi @@ -19,16 +19,16 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: addq %rdx, %rcx ; SSE2-NEXT: shrq $2, %rcx -; SSE2-NEXT: movd %rcx, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: subq %rdx, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: addq %rdx, %rcx ; SSE2-NEXT: shrq $2, %rcx -; SSE2-NEXT: movd %rcx, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -43,15 +43,15 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: addq %rdx, %rcx ; SSE41-NEXT: shrq $2, %rcx -; SSE41-NEXT: movd %rcx, %xmm1 -; SSE41-NEXT: movd %xmm0, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: subq %rdx, %rcx ; SSE41-NEXT: shrq %rcx ; 
SSE41-NEXT: addq %rdx, %rcx ; SSE41-NEXT: shrq $2, %rcx -; SSE41-NEXT: movd %rcx, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -255,7 +255,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi @@ -267,9 +267,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: leaq (,%rax,8), %rdx ; SSE2-NEXT: subq %rax, %rdx ; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: movd %rcx, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rcx, %rax @@ -280,7 +280,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: leaq (,%rax,8), %rdx ; SSE2-NEXT: subq %rax, %rdx ; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: movd %rcx, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -299,8 +299,8 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: leaq (,%rax,8), %rdx ; SSE41-NEXT: subq %rax, %rdx ; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: movd %rcx, %xmm1 -; SSE41-NEXT: movd %xmm0, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rcx, %rax @@ -311,7 +311,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: leaq (,%rax,8), %rdx ; SSE41-NEXT: subq %rax, %rdx ; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: movd %rcx, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll index 9e11edcc29dc5..f1f795bf3cb03 100644 --- a/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1579,7 +1579,7 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # BB#0: ; SSE-NEXT: movl $55, %eax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64: @@ -1607,7 +1607,7 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # BB#0: ; SSE-NEXT: movl $55, %eax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64u: diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll index 0718edf5a1433..f05588a2920c7 100644 --- a/test/CodeGen/X86/vector-pcmp.ll +++ b/test/CodeGen/X86/vector-pcmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 @@ -19,7 +19,6 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %x) { ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; %sign = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> %not = xor <16 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ret <16 x i8> %not @@ -37,7 +36,6 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %x) { ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; 
AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; %sign = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %not = xor <8 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ret <8 x i16> %not @@ -55,7 +53,6 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %x) { ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; %sign = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> %not = xor <4 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1> ret <4 x i32> %not @@ -81,7 +78,6 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) { ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; %sign = ashr <2 x i64> %x, <i64 63, i64 63> %not = xor <2 x i64> %sign, <i64 -1, i64 -1> ret <2 x i64> %not @@ -91,23 +87,23 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) { ; SSE2-LABEL: test_strange_type: ; SSE2: # BB#0: ; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %rsi, %xmm0 +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: notq %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: movq %rsi, %rdx ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_strange_type: ; SSE42: # BB#0: ; SSE42-NEXT: sarq $63, %rsi -; SSE42-NEXT: movd %rsi, %xmm0 +; SSE42-NEXT: movq %rsi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm0, %xmm1 -; SSE42-NEXT: movd %xmm1, %rax +; SSE42-NEXT: movq %xmm1, %rax ; SSE42-NEXT: pextrq $1, %xmm1, %rdx ; SSE42-NEXT: retq ; @@ -132,7 +128,6 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) { ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx ; AVX2-NEXT: retq -; %sign = ashr <1 x i128> %x, <i128 127> %not = xor <1 x i128> %sign, <i128 -1> ret <1 x i128> %not @@ -163,7 +158,6 @@ define <32 x i8> 
@test_pcmpgtb_256(<32 x i8> %x) { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %sign = ashr <32 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> %not = xor <32 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ret <32 x i8> %not @@ -193,7 +187,6 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %sign = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %not = xor <16 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ret <16 x i16> %not @@ -223,7 +216,6 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %sign = ashr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> %not = xor <8 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> ret <8 x i32> %not @@ -266,7 +258,6 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %sign = ashr <4 x i64> %x, <i64 63, i64 63, i64 63, i64 63> %not = xor <4 x i64> %sign, <i64 -1, i64 -1, i64 -1, i64 -1> ret <4 x i64> %not @@ -284,7 +275,6 @@ define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), 
%xmm0, %xmm0 ; AVX-NEXT: retq -; %cmp = icmp eq <16 x i8> %a, %b %zext = zext <16 x i1> %cmp to <16 x i8> ret <16 x i8> %zext @@ -314,7 +304,6 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %cmp = icmp eq <16 x i16> %a, %b %zext = zext <16 x i1> %cmp to <16 x i16> ret <16 x i16> %zext @@ -332,7 +321,6 @@ define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX-NEXT: retq -; %cmp = icmp eq <4 x i32> %a, %b %zext = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %zext @@ -375,7 +363,6 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %cmp = icmp eq <4 x i64> %a, %b %zext = zext <4 x i1> %cmp to <4 x i64> ret <4 x i64> %zext @@ -406,7 +393,6 @@ define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq -; %cmp = icmp sgt <32 x i8> %a, %b %zext = zext <32 x i1> %cmp to <32 x i8> ret <32 x i8> %zext @@ -424,7 +410,6 @@ define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX-NEXT: retq -; %cmp = icmp sgt <8 x i16> %a, %b %zext = zext <8 x i1> %cmp to <8 x i16> ret <8 x i16> %zext @@ -454,7 +439,6 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 ; AVX2-NEXT: retq -; %cmp = icmp sgt <8 x i32> %a, %b %zext = zext <8 x i1> %cmp to <8 x i32> ret <8 x i32> %zext @@ -488,7 +472,6 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0 ; AVX-NEXT: retq -; %cmp = 
icmp sgt <2 x i64> %a, %b %zext = zext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %zext diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index e9f1d1d8522b3..8cc1d8c765ac3 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1207,10 +1207,10 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shlq $62, %rcx ; SSE-NEXT: sarq $63, %rcx -; SSE-NEXT: movd %rcx, %xmm1 +; SSE-NEXT: movq %rcx, %xmm1 ; SSE-NEXT: shlq $63, %rax ; SSE-NEXT: sarq $63, %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -1687,28 +1687,28 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_4i8_to_4i64: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movsbq 1(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movsbq 3(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: movq %rax, %xmm2 ; SSE2-NEXT: movsbq 2(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i8_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movsbq 1(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: movq %rax, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: movsbq 3(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: movq %rax, %xmm2 ; SSSE3-NEXT: movsbq 2(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSSE3-NEXT: retq ; @@ -2038,48 +2038,48 @@ define <8 x i64> 
@load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; SSE2-LABEL: load_sext_8i8_to_8i64: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movsbq 1(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movsbq 3(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: movq %rax, %xmm2 ; SSE2-NEXT: movsbq 2(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: movsbq 5(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm3 +; SSE2-NEXT: movq %rax, %xmm3 ; SSE2-NEXT: movsbq 4(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: movq %rax, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movsbq 7(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm4 +; SSE2-NEXT: movq %rax, %xmm4 ; SSE2-NEXT: movsbq 6(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm3 +; SSE2-NEXT: movq %rax, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_8i8_to_8i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movsbq 1(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: movq %rax, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: movsbq 3(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: movq %rax, %xmm2 ; SSSE3-NEXT: movsbq 2(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSSE3-NEXT: movsbq 5(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm3 +; SSSE3-NEXT: movq %rax, %xmm3 ; SSSE3-NEXT: movsbq 4(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: movq %rax, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSSE3-NEXT: movsbq 7(%rdi), %rax -; SSSE3-NEXT: movd 
%rax, %xmm4 +; SSSE3-NEXT: movq %rax, %xmm4 ; SSSE3-NEXT: movsbq 6(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm3 +; SSSE3-NEXT: movq %rax, %xmm3 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSSE3-NEXT: retq ; @@ -4542,28 +4542,28 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSE2-LABEL: load_sext_4i16_to_4i64: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movswq 2(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movswq 6(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: movq %rax, %xmm2 ; SSE2-NEXT: movswq 4(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i16_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movswq 2(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: movq %rax, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: movswq 6(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: movq %rax, %xmm2 ; SSSE3-NEXT: movswq 4(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSSE3-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index d0ead653b203d..e38d3f9744852 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -986,7 +986,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) { define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) { ; SSE-LABEL: insert_reg_and_zero_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movd %rdi, %xmm0 +; SSE-NEXT: movq %rdi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: 
insert_reg_and_zero_v2i64: @@ -1048,25 +1048,25 @@ define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) { define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) { ; SSE2-LABEL: insert_reg_lo_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %rdi, %xmm1 +; SSE2-NEXT: movq %rdi, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v2i64: ; SSE3: # BB#0: -; SSE3-NEXT: movd %rdi, %xmm1 +; SSE3-NEXT: movq %rdi, %xmm1 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v2i64: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd %rdi, %xmm1 +; SSSE3-NEXT: movq %rdi, %xmm1 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movd %rdi, %xmm1 +; SSE41-NEXT: movq %rdi, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; @@ -1140,7 +1140,7 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) { define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { ; SSE-LABEL: insert_reg_hi_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movd %rdi, %xmm1 +; SSE-NEXT: movq %rdi, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 3e9e980a19730..e9c0d0962ab3e 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2104,25 +2104,25 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) { ; SSE2-LABEL: insert_reg_lo_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: movd %rdi, %xmm1 +; SSE2-NEXT: movq %rdi, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v4i32: ; SSE3: # BB#0: -; SSE3-NEXT: movd %rdi, %xmm1 +; SSE3-NEXT: movq %rdi, %xmm1 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = 
xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v4i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd %rdi, %xmm1 +; SSSE3-NEXT: movq %rdi, %xmm1 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: movd %rdi, %xmm1 +; SSE41-NEXT: movq %rdi, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; @@ -2191,7 +2191,7 @@ define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) { ; SSE-LABEL: insert_reg_hi_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movd %rdi, %xmm1 +; SSE-NEXT: movq %rdi, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 1385929ab8cd3..202acbcd35007 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -879,3 +879,29 @@ define <32 x i8> @constant_fold_pshufb_256() { %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>) ret <32 x i8> %1 } + +define <32 x i8> @PR27320(<8 x i32> %a0) { +; X32-LABEL: PR27320: +; X32: # BB#0: +; X32-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-NEXT: vextracti128 $1, %ymm0, %xmm2 +; X32-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7] +; X32-NEXT: vpor 
%xmm1, %xmm2, %xmm1 +; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11] +; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: PR27320: +; X64: # BB#0: +; X64-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vextracti128 $1, %ymm0, %xmm2 +; X64-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7] +; X64-NEXT: vpor %xmm1, %xmm2, %xmm1 +; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11] +; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef> + %2 = bitcast <8 x i32> %1 to <32 x i8> + %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27> + ret <32 x i8> %3 +} diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index ab34ad6a613cc..a5fac9ac6a41e 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -1257,7 +1257,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v4i64_v4i32: ; SSE: # BB#0: ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 @@ -1301,7 +1301,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v8i64_v8i16: ; SSE: # BB#0: ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 +; SSE-NEXT: movq %rax, %xmm4 ; 
SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] ; SSE-NEXT: psubq %xmm4, %xmm0 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 @@ -1418,7 +1418,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v16i64_v16i8: ; SSE: # BB#0: ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 +; SSE-NEXT: movq %rax, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] ; SSE-NEXT: psubq %xmm8, %xmm0 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 @@ -2411,7 +2411,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-NEXT: psllq $32, %xmm1 ; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 @@ -2554,7 +2554,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v16i64_v16i8: ; SSE: # BB#0: ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 +; SSE-NEXT: movq %rax, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pmuludq %xmm8, %xmm9 diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index d39a90b066f5e..58f7407eeec4e 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -906,7 +906,7 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) { ; SSE-LABEL: trunc2i64_i64: ; SSE: # BB#0: # %entry ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: trunc2i64_i64: @@ -1031,19 +1031,19 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc4i32_i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc4i32_i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: retq ; ; AVX-LABEL: trunc4i32_i64: @@ -1158,19 +1158,19 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) { ; SSE2: # BB#0: # %entry ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc8i16_i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc8i16_i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: retq ; ; AVX-LABEL: trunc8i16_i64: diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 56f634c4188fd..22d0065b264fc 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1249,7 +1249,7 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # BB#0: ; SSE-NEXT: movl $8, %eax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64: @@ -1271,7 +1271,7 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # BB#0: ; SSE-NEXT: movl $8, %eax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 
; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64u: diff --git a/test/CodeGen/X86/vmovq.ll b/test/CodeGen/X86/vmovq.ll index 45d350c743e25..5c1ff7d06ee0b 100644 --- a/test/CodeGen/X86/vmovq.ll +++ b/test/CodeGen/X86/vmovq.ll @@ -6,7 +6,7 @@ define <2 x i64> @PR25554(<2 x i64> %v0, <2 x i64> %v1) { ; SSE-LABEL: PR25554: ; SSE: # BB#0: ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; SSE-NEXT: paddq %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll index 7ad5706592e42..c9a34de123692 100644 --- a/test/CodeGen/X86/vshift-1.ll +++ b/test/CodeGen/X86/vshift-1.ll @@ -39,7 +39,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind { ; ; X64-LABEL: shift1b: ; X64: # BB#0: # %entry -; X64-NEXT: movd %rsi, %xmm1 +; X64-NEXT: movq %rsi, %xmm1 ; X64-NEXT: psllq %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll index f79fc5bff9687..88cba8a4d6ac8 100644 --- a/test/CodeGen/X86/vshift-2.ll +++ b/test/CodeGen/X86/vshift-2.ll @@ -39,7 +39,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind { ; ; X64-LABEL: shift1b: ; X64: # BB#0: # %entry -; X64-NEXT: movd %rsi, %xmm1 +; X64-NEXT: movq %rsi, %xmm1 ; X64-NEXT: psrlq %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll index e62698221973f..f844904c86905 100644 --- a/test/CodeGen/X86/vsplit-and.ll +++ b/test/CodeGen/X86/vsplit-and.ll @@ -23,13 +23,13 @@ define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind read define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly { ; CHECK-LABEL: t2: ; CHECK: # BB#0: -; CHECK-NEXT: movd %r9, %xmm1 -; CHECK-NEXT: movd %r8, %xmm0 +; CHECK-NEXT: 
movq %r9, %xmm1 +; CHECK-NEXT: movq %r8, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: movd %rdx, %xmm2 -; CHECK-NEXT: movd %rsi, %xmm1 +; CHECK-NEXT: movq %rdx, %xmm2 +; CHECK-NEXT: movq %rsi, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; CHECK-NEXT: movd %rcx, %xmm2 +; CHECK-NEXT: movq %rcx, %xmm2 ; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: pxor %xmm4, %xmm4 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm2 diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll index a973fdaa8d601..986fa4743c6c2 100644 --- a/test/CodeGen/X86/widen_cast-5.ll +++ b/test/CodeGen/X86/widen_cast-5.ll @@ -16,7 +16,7 @@ define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind { ; ; X64-LABEL: convert: ; X64: ## BB#0: ## %entry -; X64-NEXT: movd %rsi, %xmm0 +; X64-NEXT: movq %rsi, %xmm0 ; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll index 504485440efff..3b20f3515716c 100644 --- a/test/CodeGen/X86/widen_conv-3.ll +++ b/test/CodeGen/X86/widen_conv-3.ll @@ -105,7 +105,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE2-LABEL: convert_v3i8_to_v3f32: ; X64-SSE2: # BB#0: # %entry ; X64-SSE2-NEXT: movzwl (%rsi), %eax -; X64-SSE2-NEXT: movd %rax, %xmm0 +; X64-SSE2-NEXT: movq %rax, %xmm0 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) @@ -129,7 +129,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE42: # BB#0: # %entry ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax ; X64-SSE42-NEXT: movzwl (%rsi), %ecx -; X64-SSE42-NEXT: movd %rcx, %xmm0 +; X64-SSE42-NEXT: movq %rcx, %xmm0 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0 ; X64-SSE42-NEXT: pslld $24, %xmm0 diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll index ef56692e947ce..6dc938893d384 100644 --- a/test/CodeGen/X86/widen_conv-4.ll +++ b/test/CodeGen/X86/widen_conv-4.ll @@ -130,7 +130,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE2-LABEL: convert_v3i8_to_v3f32: ; X64-SSE2: # BB#0: # %entry ; X64-SSE2-NEXT: movzwl (%rsi), %eax -; X64-SSE2-NEXT: movd %rax, %xmm0 +; X64-SSE2-NEXT: movq %rax, %xmm0 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) @@ -154,7 +154,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE42: # BB#0: # %entry ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax ; X64-SSE42-NEXT: movzwl (%rsi), %ecx -; X64-SSE42-NEXT: movd %rcx, %xmm0 +; X64-SSE42-NEXT: movq %rcx, %xmm0 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0 ; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0 |
