author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000
commit | 6b3f41ed88e8e440e11a4fbf20b6600529f80049 (patch) |
tree | 928b056f24a634d628c80238dbbf10d41b1a71d5 /test/CodeGen |
parent | c46e6a5940c50058e00c0c5f9123fd82e338d29a (diff) |
download | src-6b3f41ed88e8e440e11a4fbf20b6600529f80049.tar.gz, src-6b3f41ed88e8e440e11a4fbf20b6600529f80049.zip |
Diffstat (limited to 'test/CodeGen')
209 files changed, 15509 insertions, 6576 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 739fdd5cb4c5..0f054f1d940c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -74,6 +74,21 @@ %res = bitcast <2 x i32> %vres to i64 ret i64 %res } + + define i64 @floatingPointLoad(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %varg2 = load double, double* %addr + %vres = fadd double %varg1, %varg2 + %res = bitcast double %vres to i64 + ret i64 %res + } + + define void @floatingPointStore(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %vres = fadd double %varg1, %varg1 + store double %vres, double* %addr + ret void + } ... --- @@ -650,3 +665,84 @@ body: | RET_ReallyLR implicit %x0 ... + +--- +# Make sure we map what looks like floating point +# loads to floating point register bank. +# CHECK-LABEL: name: floatingPointLoad +name: floatingPointLoad +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + +# No repairing should be necessary for both modes. +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# CHECK-NEXT: %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %3(s64) = G_FADD %4, %2 +# CHECK-NEXT: %x0 = COPY %3(s64) +# CHECK-NEXT: RET_ReallyLR implicit %x0 + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) + %3(s64) = G_FADD %0, %2 + %x0 = COPY %3(s64) + RET_ReallyLR implicit %x0 + +... + +--- +# Make sure we map what looks like floating point +# stores to floating point register bank. +# CHECK-LABEL: name: floatingPointStore +name: floatingPointStore +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3(s64) = COPY %0 +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %2(s64) = G_FADD %3, %4 +# CHECK-NEXT: G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_FADD %0, %0 + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) + RET_ReallyLR + +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll index f8d95c88cc8f..44705a9c9f65 100644 --- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s ; CHECK-LABEL: name: test_trivial_call -; CHECK: ADJCALLSTACKDOWN 0, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def %sp, implicit %sp ; CHECK: BL @trivial_callee, csr_aarch64_aapcs, implicit-def %lr ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def %sp, implicit %sp declare void @trivial_callee() @@ -186,7 +186,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) { ; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42 ; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12 ; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0 -; CHECK: ADJCALLSTACKDOWN 24, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 24, 0, implicit-def %sp, implicit %sp ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp ; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64) diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index 2682fa7dcce1..fc1aeb7b37d9 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -378,11 +378,11 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { ; CHECK-NEXT: cmp x0, #13 ; CHECK-NOT: ccmp ; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: cmp x2, #2 ; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt ; CHECK-NEXT: cmp x2, #4 ; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt -; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] ; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] ; CHECK-NEXT: cmp [[REG6]], #0 diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll index 840d1dcbf060..f97498825279 100644 --- a/test/CodeGen/AArch64/arm64-fml-combines.ll +++ b/test/CodeGen/AArch64/arm64-fml-combines.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -fp-contract=fast | FileCheck %s + define void @foo_2d(double* %src) { entry: %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 @@ -126,3 +128,23 @@ for.body: ; preds = %for.body, %entry for.end: ; preds = %for.body ret void } + +; CHECK-LABEL: test1: +; CHECK: fnmadd s0, s0, s1, s2 +define float @test1(float %a, float %b, float %c) { +entry: + %0 = fmul float %a, %b + %mul = fsub float -0.000000e+00, %0 + %sub1 = fsub float %mul, %c + ret float %sub1 +} + +; CHECK-LABEL: test2: +; CHECK: fnmadd d0, d0, d1, d2 +define double @test2(double %a, double %b, double %c) { +entry: + %0 = fmul double %a, %b + %mul = fsub double -0.000000e+00, %0 + %sub1 = fsub double %mul, %c + ret double %sub1 +} diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll index caaf8615cd4a..a8d1c2482520 100644 --- a/test/CodeGen/AArch64/arm64-hello.ll +++ b/test/CodeGen/AArch64/arm64-hello.ll @@ -6,8 +6,8 @@ ; CHECK-NEXT: stp x29, x30, [sp, #16] ; CHECK-NEXT: add x29, sp, #16 ; CHECK-NEXT: stur wzr, [x29, 
#-4] -; CHECK: adrp x0, L_.str@PAGE -; CHECK: add x0, x0, L_.str@PAGEOFF +; CHECK: adrp x0, l_.str@PAGE +; CHECK: add x0, x0, l_.str@PAGEOFF ; CHECK-NEXT: bl _puts ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; CHECK-NEXT: add sp, sp, #32 diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll index 3593668e0156..4c0195b93a44 100644 --- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -12,7 +12,7 @@ ; CHECK: Successors: ; CHECK-NOT: ch SU(4) ; CHECK: SU(3) -; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}} +; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}} define i32 @foo() { entry: %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4 diff --git a/test/CodeGen/AArch64/macho-global-symbols.ll b/test/CodeGen/AArch64/macho-global-symbols.ll new file mode 100644 index 000000000000..d68abad57ccd --- /dev/null +++ b/test/CodeGen/AArch64/macho-global-symbols.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s + +; All global symbols must be at-most linker-private for AArch64 because we don't +; use section-relative relocations in MachO. + +define i8* @private_sym() { +; CHECK-LABEL: private_sym: +; CHECK: adrp [[HIBITS:x[0-9]+]], l_var@PAGE +; CHECK: add x0, [[HIBITS]], l_var@PAGEOFF + + ret i8* getelementptr([2 x i8], [2 x i8]* @var, i32 0, i32 0) +} + +; CHECK: .section __TEXT,__cstring +; CHECK: l_var: +; CHECK: .asciz "\002" +@var = private unnamed_addr constant [2 x i8] [i8 2, i8 0] diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index f29dfb3a9802..4c682e594e66 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,4 +1,5 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -87,6 +88,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] ; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} @@ -187,6 +204,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesimc 
{{v[0-7].16b}}, [[VG]] ; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll index 5646703fa403..677ff8dc2530 100644 --- a/test/CodeGen/AArch64/stackmap-frame-setup.ll +++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN 0, implicit-def +; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def ret void diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir index 56a9e7022db9..2a3d3887ed69 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -14,7 +14,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 -# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0 +# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir index ea435725bf25..89be3bde94a8 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -15,7 +15,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 # GCN: [[VAL:%[0-9]+]] = COPY %vgpr2 -# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0 +# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir new file mode 100644 index 000000000000..8839ba8e0ab2 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir @@ -0,0 +1,20 @@ +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + define void @test_constant() { + entry: + ret void + } +... + +--- +name: test_constant +registers: + - { id: 0, class: _ } +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_constant + ; CHECK: %0(s32) = G_CONSTANT i32 5 + + %0(s32) = G_CONSTANT i32 5 +... 
diff --git a/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg new file mode 100644 index 000000000000..e99d1bb8446c --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'global-isel' in config.root.available_features: + config.unsupported = True diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 62b47beb1251..bc992ed77ffd 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -219,19 +219,19 @@ body: | %34 = V_MOV_B32_e32 63, implicit %exec %27 = V_AND_B32_e64 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %27, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_AND_B32_e64 %24, %26, implicit %exec - FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %29 = V_AND_B32_e32 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %29, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %30 = V_AND_B32_e64 %26, %26, implicit %exec - FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %30, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %31 = V_AND_B32_e64 %34, %34, implicit %exec - FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %31, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -407,34 +407,34 @@ body: | %27 = S_MOV_B32 -4 %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHL_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHL_B32_e64 12, %7, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHL_B32_e64 12, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHL_B32_e64 %6, 12, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD 
%20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHL_B32_e64 %6, 32, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHL_B32_e32 %6, %6, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHL_B32_e32 %27, %6, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -615,34 +615,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_ASHR_I32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_ASHR_I32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_ASHR_I32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_ASHR_I32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_ASHR_I32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_ASHR_I32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 
0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_ASHR_I32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -824,34 +824,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHR_B32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHR_B32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHR_B32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHR_B32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHR_B32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHR_B32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHR_B32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 0831d250b9e7..8611cd080e15 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_or_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: 
v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { @@ -50,7 +50,7 @@ define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_xor_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { @@ -86,8 +86,8 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_not_0: -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -104,8 +104,8 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN: buffer_load_dwordx2 ; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}} -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index a29e72ea57cb..aa913ad406d2 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) ; XXX - Why 0 in register? 
; FUNC-LABEL: {{^}}v_ctpop_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -40,9 +40,9 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: ; GCN: buffer_load_dword [[VAL1:v[0-9]+]], ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -61,7 +61,7 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -87,10 +87,10 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -105,14 +105,14 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -131,22 +131,22 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: 
v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -189,7 +189,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -206,7 +206,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { @@ -220,7 +220,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -236,7 +236,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}} ; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index 2610684ad9ee..f18bd9fd8174 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -26,9 +26,9 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) ; FUNC-LABEL: {{^}}v_ctpop_i64: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} 
[[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { @@ -41,9 +41,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -171,11 +171,11 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; FUNC-LABEL: {{^}}v_ctpop_i128: ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 1c0e9a2f13ce..66bf9d0ffb00 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] -; GCN: buffer_store_dword [[MUL]] +; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 +; GCN-NEXT: buffer_store_dword [[ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index 626a0b50cce8..ed36666db807 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ 
b/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s ; FIXME: Should be able to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: @@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x ret void } +; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} +; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} + +; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] +; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] +; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] +define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val + %elt0 = extractelement <2 x half> %fneg, i32 0 + %elt1 = extractelement <2 x half> %fneg, i32 1 + + %fmul0 = fmul half %elt0, 4.0 + %fadd1 = fadd half %elt1, 2.0 + store volatile half %fmul0, half addrspace(1)* undef + store volatile half %fadd1, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]] +; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] +define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val + %elt0 = extractelement <2 x half> %fneg, i32 0 + %elt1 = extractelement <2 x half> %fneg, i32 1 + store volatile half %elt0, half addrspace(1)* undef + store volatile half %elt1, half addrspace(1)* undef + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir index c6fe6debd225..ff9fcd1c693f 100644 --- a/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -246,15 +246,15 @@ body: | S_BRANCH %bb.1 bb.1: - FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr 
%vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec S_ENDPGM diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir index 106a96e32dc3..a0d2d6c097a2 100644 --- a/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -57,15 +57,15 @@ body: | %4.sub1 = COPY %3.sub0 undef %5.sub0 = COPY %4.sub1 %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, implicit %exec, implicit %flat_scr %6 = IMPLICIT_DEF undef %7.sub0_sub1 = COPY %6 %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, implicit %exec, implicit %flat_scr %8 = IMPLICIT_DEF undef %9.sub0_sub1_sub2 = COPY %8 %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, implicit %exec, implicit %flat_scr ... diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index b92eb34750d9..7179d02fc6dd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -7,7 +7,7 @@ ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]] define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32: ; GCN: s_load_dword [[X:s[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -39,7 +39,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; 
GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, 1.0, [[A]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -103,7 +103,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -140,7 +140,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll index ab76c870796b..144c8f428ab0 100644 --- 
a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mbcnt_intrinsics: -; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] -; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] +; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]] define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) { main_body: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index eb4066a2a0a8..5f1fb0e2d732 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -63,7 +63,7 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -198,7 +198,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: buffer_load_dword [[VGPR:v[0-9]+]] -; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 9c43a6dc60f4..d7655993a2d9 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -1,26 +1,26 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s ; CHECK-LABEL: @volatile_load( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: load volatile i32, i32* define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr 
inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp %load = load volatile i32, i32* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: @volatile_store( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: store volatile i32 %tmp, i32* define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp store volatile i32 %tmp, i32* %arrayidx1 ret void } diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll index bfb10503aaea..0148ff470b78 100644 --- a/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}madak_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} +; VI: v_madak_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @madak_f16( diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir index 38662e83b359..f754415dccb4 100644 --- a/test/CodeGen/AMDGPU/waitcnt.mir +++ b/test/CodeGen/AMDGPU/waitcnt.mir @@ -51,21 +51,21 @@ name: flat_zero_waitcnt body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.1 bb.1: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.2 bb.2: - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_ENDPGM ... 
@@ -86,11 +86,11 @@ name: single_fallthrough_successor_no_end_block_wait body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr bb.1: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... --- @@ -114,15 +114,15 @@ name: single_branch_successor_not_next_block body: | bb.0: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr S_BRANCH %bb.2 bb.1: - FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM bb.2: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 83ab2659ef4a..72c3b715d36e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -4,6 +4,8 @@ define void @test_sext_s1() { ret void } define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void @test_anyext_s8() { ret void } + define void @test_anyext_s16() { ret void } define void @test_trunc_s32_16() { ret void } @@ -149,6 +151,58 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8 +# CHECK-LABEL: name: test_anyext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_anyext_s16 +# CHECK-LABEL: name: test_anyext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true @@ -187,9 +241,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -200,11 +260,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -220,9 +289,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -233,11 +308,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -352,9 +436,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -365,11 +455,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -385,9 +484,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: 
gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -398,11 +503,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -451,9 +565,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -464,11 +584,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -484,9 +613,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -497,11 +632,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 44fe7410b42c..53577dbd76f6 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ 
b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -421,7 +421,7 @@ entry: define arm_aapcscc void @test_indirect_call(void() *%fptr) { ; CHECK-LABEL: name: test_indirect_call ; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -433,7 +433,7 @@ declare arm_aapcscc void @call_target() define arm_aapcscc void @test_direct_call() { ; CHECK-LABEL: name: test_direct_call -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -447,7 +447,7 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_reg_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0 @@ -466,7 +466,7 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_stack_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK-DAG: %r2 = COPY [[BVREG]] @@ -496,7 +496,7 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { ; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1 ; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 20, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8) ; CHECK: %r0 = COPY [[SEXTA]] ; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8) @@ -547,7 +547,7 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { ; CHECK-LABEL: name: test_call_vfpcc_fp_params ; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %s0 = COPY [[BVREG]] ; CHECK-DAG: %d1 = COPY [[AVREG]] ; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0 @@ -569,7 +569,7 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32 ; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: 
[[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0 ; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32 @@ -608,7 +608,7 @@ declare arm_aapcscc float @different_call_conv_target(float) define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { ; CHECK-LABEL: name: test_call_different_call_conv ; CHECK: [[X:%[0-9]+]](s32) = COPY %s0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: %r0 = COPY [[X]] ; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0 ; CHECK: [[R:%[0-9]+]](s32) = COPY %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 625d35acf17b..f6ac92597cb2 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -91,8 +91,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_ADD %0, %1 - ; G_ADD with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -115,8 +116,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_ADD %0, %1 - ; G_ADD with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -165,8 +167,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_SUB %0, %1 - ; G_SUB with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -189,8 +192,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_SUB %0, %1 - ; G_SUB with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -239,8 +243,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_MUL %0, %1 - ; G_MUL with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... 
@@ -263,8 +268,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_MUL %0, %1 - ; G_MUL with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 4e94fb4e3481..dfccc47c277c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -25,6 +25,9 @@ define void @test_constants() { ret void } + define void @test_anyext_s8_32() { ret void } + define void @test_anyext_s16_32() { ret void } + define void @test_trunc_s32_16() { ret void } define void @test_fadd_s32() #0 { ret void } @@ -71,19 +74,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_ADD %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_ADD %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -97,19 +109,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_ADD %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_ADD %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -123,19 +144,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s1) = COPY %r0 %1(s1) = COPY %r1 - %2(s1) = G_ADD %0, %1 - %r0 = COPY %2(s1) + %2(s32) = G_ANYEXT %0(s1) + %3(s32) = G_ANYEXT %1(s1) + %4(s32) = G_ADD %2, %3 + %5(s1) = G_TRUNC %4(s32) + %r0 = COPY %5(s1) BX_RET 14, _, implicit %r0 ... 
@@ -175,19 +205,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_SUB %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_SUB %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -201,19 +240,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_SUB %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_SUB %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -253,19 +301,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_MUL %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_MUL %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -279,19 +336,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_MUL %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_MUL %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -500,6 +566,48 @@ body: | BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8_32 +# CHECK-LABEL: name: test_anyext_s8_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + %1(s32) = G_ANYEXT %0(s8) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_anyext_s16_32 +# CHECK-LABEL: name: test_anyext_s16_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + %1(s32) = G_ANYEXT %0(s16) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... +--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll index ce5a1df05e3f..77ffc46e6a69 100644 --- a/test/CodeGen/ARM/divmod-eabi.ll +++ b/test/CodeGen/ARM/divmod-eabi.ll @@ -16,17 +16,15 @@ ; RUN: llc -mtriple armv7-linux-gnueabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI -; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 -; FIXME: long-term, we will use "-apple-macho" and won't need this exception: -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 +; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN +; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=DARWIN-O0 ; RUN: llc -mtriple thumbv7-windows %s -o - | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-DEFAULT ; RUN: llc -mtriple thumbv7-windows %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-O0 define signext i16 @f16(i16 signext %a, i16 signext %b) { ; EABI-LABEL: f16: ; DARWIN-LABEL: f16: +; DARWIN-O0-LABEL: f16: ; WINDOWS-LABEL: f16: entry: %conv = sext i16 %a to i32 @@ -36,11 +34,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: __rt_sdiv ; WINDOWS-DEFAULT: add [[sum:r[0-9]+]], r1 @@ -48,16 +44,13 @@ entry: %rem8 = srem i32 %conv1, %conv ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add13 = add nsw i32 %add, %rem8 %conv14 = trunc i32 %add13 to i16 ; EABI: add r0{{.*}}r1 ; EABI: sxth r0, r0 -; DARWIN-DEFAULT: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN: sxth r0, [[res]] ; WINDOWS-DEFAULT: adds [[sum1:r[0-9]+]], [[sum]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], ; WINDOWS-O0: add [[sum1:r[0-9]+]], r1 @@ -68,6 +61,7 @@ entry: define i32 @f32(i32 %a, i32 %b) { ; EABI-LABEL: f32: ; DARWIN-LABEL: f32: +; DARWIN-O0-LABEL: f32: ; WINDOWS-LABEL: f32: entry: %div = sdiv i32 %a, %b @@ -75,11 +69,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; 
DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv @@ -87,13 +79,11 @@ entry: %rem1 = srem i32 %b, %a ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add2 = add nsw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds r0, [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]] ; WINDOWS-O0: add [[sum]], r1 @@ -103,16 +93,15 @@ entry: define i32 @uf(i32 %a, i32 %b) { ; EABI-LABEL: uf: ; DARWIN-LABEL: uf: +; DARWIN-O0-LABEL: uf: ; WINDOWS-LABEL: uf: entry: %div = udiv i32 %a, %b %rem = urem i32 %a, %b ; EABI: __aeabi_uidivmod -; DARWIN: ___udivsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __umodsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __udivmodsi4 +; DARWIN-O0: __udivsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_udiv @@ -120,13 +109,11 @@ entry: %rem1 = urem i32 %b, %a ; EABI: __aeabi_uidivmod ; DARWIN: __umodsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv %add = add nuw i32 %rem, %div %add2 = add nuw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds [[sum:r[0-9]+]], [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], ; WINDOWS-O0: add [[sum]], r1 @@ -136,6 +123,7 @@ entry: define i64 @longf(i64 %a, i64 %b) { ; EABI-LABEL: longf: ; DARWIN-LABEL: longf: +; DARWIN-O0-LABEL: longf: ; WINDOWS-LABEL: longf: entry: %div = sdiv i64 %a, %b @@ -148,6 +136,8 @@ entry: ; DARWIN: mov [[div1:r[0-9]+]], r0 ; DARWIN: mov [[div2:r[0-9]+]], r1 ; DARWIN: __moddi3 +; DARWIN-O0: __divdi3 +; DARWIN-O0: __moddi3 ; WINDOWS: __rt_sdiv64 %add = add nsw i64 %rem, %div ; DARWIN: adds r0{{.*}}[[div1]] @@ -160,20 +150,19 @@ entry: define i16 @shortf(i16 %a, i16 %b) { ; EABI-LABEL: shortf: ; DARWIN-LABEL: shortf: +; DARWIN-O0-LABEL: shortf: ; WINDOWS-LABEL: shortf: entry: %div = sdiv i16 %a, %b %rem = srem i16 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[div1:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divmodsi4 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i16 %rem, %div ; EABI: add r0, r1 -; DARWIN: add r0{{.*}}[[div1]] ; WINDOWS: adds r0, r1, [[div]] ret i16 %add } @@ -181,20 +170,20 @@ entry: define i32 @g1(i32 %a, i32 %b) { ; EABI-LABEL: g1: ; DARWIN-LABEL: g1: +; DARWIN-O0-LABEL: g1: ; WINDOWS-LABEL: g1: entry: %div = sdiv i32 %a, %b %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[sum:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0{{.*}}r1 -; DARWIN: add r0{{.*}}[[sum]] ; WINDOWS: adds r0, r1, [[div]] ret i32 %add } @@ -203,11 +192,13 @@ entry: define i32 @g2(i32 %a, i32 %b) { ; EABI-LABEL: g2: ; DARWIN-LABEL: g2: +; DARWIN-O0-LABEL: g2: ; WINDOWS-LABEL: g2: entry: %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; 
DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ret i32 %rem ; EABI: mov r0, r1 @@ -217,6 +208,7 @@ entry: define i32 @g3(i32 %a, i32 %b) { ; EABI-LABEL: g3: ; DARWIN-LABEL: g3: +; DARWIN-O0-LABEL: g3: ; WINDOWS-LABEL: g3: entry: %rem = srem i32 %a, %b @@ -224,11 +216,13 @@ entry: ; EABI: mov [[mod:r[0-9]+]], r1 ; DARWIN: __modsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[rem:r[0-9]+]], r1 %rem1 = srem i32 %b, %rem ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem1, %rem ; EABI: add r0, r1, [[mod]] @@ -240,6 +234,7 @@ entry: define i32 @g4(i32 %a, i32 %b) { ; EABI-LABEL: g4: ; DARWIN-LABEL: g4: +; DARWIN-O0-LABEL: g4: ; WINDOWS-LABEL: g4: entry: %div = sdiv i32 %a, %b @@ -247,11 +242,13 @@ entry: ; EABI: mov [[div:r[0-9]+]], r0 ; DARWIN: ___divsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __divsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 %rem = srem i32 %b, %div ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0, r1, [[div]] diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll index 9336d0c477d1..ffc1ed09cbf0 100644 --- a/test/CodeGen/ARM/divmod.ll +++ b/test/CodeGen/ARM/divmod.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; RUN: llc < %s -mtriple=thumbv7-apple-macho -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; rdar://12481395 diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll new file mode 100644 index 000000000000..ca7ec1ab831c --- /dev/null +++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll @@ -0,0 +1,35 @@ +; RUN: llc -mcpu=atmega328p < %s -march=avr | FileCheck %s + +; CHECK-LABEL: loopy +define internal fastcc void @loopy() { + +; In this case, when we expand `Select8`/`Select16`, we should be +; replacing the existing MBB instead of adding a new one. 
+; +; https://github.com/avr-rust/rust/issues/49 + +; CHECK: LBB0_1: +; CHECK: LBB0_2: +; CHECK-NOT: LBB0_3: +start: + br label %bb7.preheader + +bb7.preheader: ; preds = %bb10, %start + %i = phi i8 [ 0, %start ], [ %j, %bb10 ] + %j = phi i8 [ 1, %start ], [ %next, %bb10 ] + br label %bb10 + +bb4: ; preds = %bb10 + ret void + +bb10: ; preds = %bb7.preheader + tail call fastcc void @observe(i8 %i, i8 1) + %0 = icmp ult i8 %j, 20 + %1 = zext i1 %0 to i8 + %next = add i8 %j, %1 + br i1 %0, label %bb7.preheader, label %bb4 + +} + +declare void @observe(i8, i8); + diff --git a/test/CodeGen/Generic/expand-experimental-reductions.ll b/test/CodeGen/Generic/expand-experimental-reductions.ll new file mode 100644 index 000000000000..ef813fa7205b --- /dev/null +++ b/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -0,0 +1,210 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -expand-reductions -S | FileCheck %s +; Tests without a target which should expand all reductions +declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) + +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) + +declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>) + +declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>) + + +define i64 @add_i64(<2 x i64> %vec) { +; CHECK-LABEL: @add_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @mul_i64(<2 x i64> %vec) { +; CHECK-LABEL: @mul_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @and_i64(<2 x i64> %vec) { +; CHECK-LABEL: @and_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @or_i64(<2 x i64> %vec) { +; CHECK-LABEL: @or_i64( +; CHECK-NEXT: entry: +; 
CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @xor_i64(<2 x i64> %vec) { +; CHECK-LABEL: @xor_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define float @fadd_f32(<4 x float> %vec) { +; CHECK-LABEL: @fadd_f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: ret float [[TMP0]] +; +entry: + %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + ret float %r +} + +define float @fadd_f32_strict(<4 x float> %vec) { +; CHECK-LABEL: @fadd_f32_strict( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]]) +; CHECK-NEXT: ret float [[R]] +; +entry: + %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + ret float %r +} + +define float @fmul_f32(<4 x float> %vec) { +; CHECK-LABEL: @fmul_f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: ret float [[TMP0]] +; +entry: + %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + ret float %r +} + +define i64 @smax_i64(<2 x i64> %vec) { +; CHECK-LABEL: @smax_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @smin_i64(<2 x i64> %vec) { +; CHECK-LABEL: @smin_i64( +; CHECK-NEXT: 
entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @umax_i64(<2 x i64> %vec) { +; CHECK-LABEL: @umax_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define i64 @umin_i64(<2 x i64> %vec) { +; CHECK-LABEL: @umin_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret i64 [[TMP0]] +; +entry: + %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec) + ret i64 %r +} + +define double @fmax_f64(<2 x double> %vec) { +; CHECK-LABEL: @fmax_f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret double [[TMP0]] +; +entry: + %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec) + ret double %r +} + +define double @fmin_f64(<2 x double> %vec) { +; CHECK-LABEL: @fmin_f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: ret double [[TMP0]] +; +entry: + %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec) + ret double %r +} diff --git a/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/test/CodeGen/Hexagon/regalloc-bad-undef.mir index d8fbb92b0d50..a541e766f593 100644 --- a/test/CodeGen/Hexagon/regalloc-bad-undef.mir +++ b/test/CodeGen/Hexagon/regalloc-bad-undef.mir @@ -161,17 +161,17 @@ body: | bb.1.for.body: successors: %bb.3.for.end, %bb.2.if.end82 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def 
dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %29.isub_lo = COPY killed %r0 %29.isub_hi = S2_asr_i_r %29.isub_lo, 31 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %32.isub_lo = COPY killed %r0 %7 = S2_extractup %32, 22, 9 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, 
implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %43.isub_lo = COPY killed %r0 @@ -179,7 +179,7 @@ body: | %16 = S2_extractup %43, 6, 25 %18 = A2_tfrpi -1 %18 = S2_asl_r_p_acc %18, %47, %16.isub_lo - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %22 = S2_asl_r_p %18, %8.isub_lo diff --git a/test/CodeGen/Lanai/masking_setccs.ll b/test/CodeGen/Lanai/masking_setccs.ll new file mode 100644 index 000000000000..48136fd42574 --- /dev/null +++ b/test/CodeGen/Lanai/masking_setccs.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s | FileCheck %s + +; Test that unnecessary masking with 0x1 is not inserted. + +target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64" +target triple = "lanai" + +; CHECK-LABEL: masking: +; CHECK-NOT: mov 1 +define i32 @masking(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) { +entry: + %cmp = icmp ne i32 %a, 0 + %cmp1 = icmp ult i32 %a, %b + %or.cond = and i1 %cmp, %cmp1 + br i1 %or.cond, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp ne i32 %b, 0 + %cmp4 = icmp ult i32 %b, %c + %or.cond29 = and i1 %cmp2, %cmp4 + br i1 %or.cond29, label %return, label %if.end6 + +if.end6: ; preds = %if.end + %cmp7 = icmp ne i32 %c, 0 + %cmp9 = icmp ult i32 %c, %d + %or.cond30 = and i1 %cmp7, %cmp9 + br i1 %or.cond30, label %return, label %if.end11 + +if.end11: ; preds = %if.end6 + %cmp12 = icmp ne i32 %d, 0 + %cmp14 = icmp ult i32 %d, %a + %or.cond31 = and i1 %cmp12, %cmp14 + %b. = select i1 %or.cond31, i32 %b, i32 21 + ret i32 %b. 
+ +return: ; preds = %if.end6, %if.end, %entry + %retval.0 = phi i32 [ %c, %entry ], [ %d, %if.end ], [ %a, %if.end6 ] + ret i32 %retval.0 +} + +; CHECK-LABEL: notnot: +; CHECK-NOT: mov 1 +define i32 @notnot(i32 %x) { +entry: + %tobool = icmp ne i32 %x, 0 + %lnot.ext = zext i1 %tobool to i32 + ret i32 %lnot.ext +} diff --git a/test/CodeGen/Lanai/peephole-compare.mir b/test/CodeGen/Lanai/peephole-compare.mir index 5056a05ed1f6..51133b5e58e3 100644 --- a/test/CodeGen/Lanai/peephole-compare.mir +++ b/test/CodeGen/Lanai/peephole-compare.mir @@ -644,7 +644,7 @@ body: | bb.1.if.then: successors: %bb.2.while.body - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp @@ -663,7 +663,7 @@ body: | bb.4.if.then4: successors: %bb.5.while.body6 - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir new file mode 100644 index 000000000000..96801f5b0a37 --- /dev/null +++ b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir @@ -0,0 +1,24 @@ +# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s +--- +name: foo +body: | + bb.0: + B %bb.2 + + bb.1: + BX_RET 14, 0 + + bb.2: + Bcc %bb.1, 1, %cpsr + + bb.3: + B %bb.1 + +... + +# We should get a single block containing the BX_RET, with no successors at all + +# CHECK: body: +# CHECK-NEXT: bb.0: +# CHECK-NEXT: BX_RET + diff --git a/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir new file mode 100644 index 000000000000..5a1583f7a9be --- /dev/null +++ b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir @@ -0,0 +1,64 @@ +# RUN: llc -mtriple=arm-apple-ios -o - %s -run-pass if-converter | FileCheck %s +--- +name: f1 +body: | + bb.0: + successors: %bb.1 + + B %bb.1 + + bb.1: + successors: %bb.2, %bb.4 + + Bcc %bb.4, 1, %cpsr + + bb.2: + successors: %bb.3, %bb.5 + + Bcc %bb.5, 1, %cpsr + + bb.3: + successors: %bb.5 + + B %bb.5 + + bb.4: + successors: + + bb.5: + successors: %bb.1, %bb.6 + + Bcc %bb.1, 1, %cpsr + + bb.6: + BX_RET 14, _ + +... + +# IfConversion.cpp/canFallThroughTo thought there was a fallthrough from +# bb.4 to bb5 even if the successor list was empty. +# bb.4 is empty, so it surely looks like it can fallthrough, but this is what +# happens for a bb just containing an "unreachable". 
+ +#CHECK: body: | +#CHECK: bb.0: +#CHECK: successors: %bb.1 + +#CHECK: bb.1: +#CHECK: successors: %bb.3({{.*}}), %bb.2 + +# The original brr_cond from bb.1, jumping to the empty bb +#CHECK: Bcc %bb.2 +#CHECK: B %bb.3 + +# Empty bb.2, originally containing "unreachable" and thus has no successors +#CHECK: bb.2: +#CHECK-NOT: successors + +#CHECK: bb.3: +#CHECK: successors: %bb.1 + +# Conditional BX_RET and then loop back to bb.1 +#CHECK: BX_RET 0 +#CHECK: B %bb.1 + diff --git a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir index 2d5347e5d30d..14bb5db5a51d 100644 --- a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir +++ b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir @@ -60,7 +60,7 @@ body: | liveins: %eax MOV32mr %stack.0.tmp, 1, _, 0, _, killed %eax - ADJCALLSTACKDOWN64 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp %rsi = LEA64r %stack.0.tmp, 1, _, 0, _ %edi = MOV32r0 implicit-def dead %eflags CALL64pcrel32 @doSomething, csr_64, implicit %rsp, implicit %edi, implicit %rsi, implicit-def %rsp, implicit-def %eax diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll new file mode 100644 index 000000000000..b23f1ad37d81 --- /dev/null +++ b/test/CodeGen/MSP430/hwmult16.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_hw + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_hw + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/hwmult32.ll b/test/CodeGen/MSP430/hwmult32.ll new file mode 100644 index 000000000000..6ffeb9698862 --- /dev/null +++ b/test/CodeGen/MSP430/hwmult32.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=32bit < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_hw32 + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_hw32 + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll new file mode 100644 index 000000000000..51ca4be4a654 --- /dev/null +++ b/test/CodeGen/MSP430/hwmultf5.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s + +target 
datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_f5hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_f5hw + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_f5hw + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll index 5ccdbb701db1..b4366251698b 100644 --- a/test/CodeGen/MSP430/jumptable.ll +++ b/test/CodeGen/MSP430/jumptable.ll @@ -12,7 +12,7 @@ entry: store i16 %i, i16* %i.addr, align 2 %0 = load i16, i16* %i.addr, align 2 ; CHECK: mov.w #2, r13 -; CHECK: call #__mulhi3hw_noint +; CHECK: call #__mspabi_mpyi ; CHECK: br .LJTI0_0(r12) switch i16 %0, label %sw.default [ i16 0, label %sw.bb diff --git a/test/CodeGen/MSP430/libcalls.ll b/test/CodeGen/MSP430/libcalls.ll new file mode 100644 index 000000000000..950ed6c17e2c --- /dev/null +++ b/test/CodeGen/MSP430/libcalls.ll @@ -0,0 +1,595 @@ +; RUN: llc -O0 < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_double = global double 123.0, align 8 +@g_float = global float 123.0, align 8 +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define float @d2f() #0 { +entry: +; CHECK: d2f: + +; CHECK: call #__mspabi_cvtdf + %0 = load volatile double, double* @g_double, align 8 + %1 = fptrunc double %0 to float + + ret float %1 +} + +define double @f2d() #0 { +entry: +; CHECK: f2d: + +; CHECK: call #__mspabi_cvtfd + %0 = load volatile float, float* @g_float, align 8 + %1 = fpext float %0 to double + + ret double %1 +} + +define i32 @d2l() #0 { +entry: +; CHECK: d2l: + +; CHECK: call #__mspabi_fixdli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ll() #0 { +entry: +; CHECK: d2ll: + +; CHECK: call #__mspabi_fixdlli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i64 + + ret i64 %1 +} + +define i32 @d2ul() #0 { +entry: +; CHECK: d2ul: + +; CHECK: call #__mspabi_fixdul + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ull() #0 { +entry: +; CHECK: d2ull: + +; CHECK: call #__mspabi_fixdull + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i64 + + ret i64 %1 +} + +define i32 @f2l() #0 { +entry: +; CHECK: f2l: + +; CHECK: call #__mspabi_fixfli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ll() #0 { +entry: +; CHECK: f2ll: + +; CHECK: call #__mspabi_fixflli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i64 + + ret i64 %1 +} + +define i32 @f2ul() #0 { +entry: +; CHECK: f2ul: + +; CHECK: call #__mspabi_fixful + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ull() #0 { +entry: +; CHECK: f2ull: + +; CHECK: call 
#__mspabi_fixfull + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i64 + + ret i64 %1 +} + +define double @l2d() #0 { +entry: +; CHECK: l2d: + +; CHECK: call #__mspabi_fltlid + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to double + + ret double %1 +} + +define double @ll2d() #0 { +entry: +; CHECK: ll2d: + +; CHECK: call #__mspabi_fltllid + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to double + + ret double %1 +} + +define double @ul2d() #0 { +entry: +; CHECK: ul2d: + +; CHECK: call #__mspabi_fltuld + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to double + + ret double %1 +} + +define double @ull2d() #0 { +entry: +; CHECK: ull2d: + +; CHECK: call #__mspabi_fltulld + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to double + + ret double %1 +} + +define float @l2f() #0 { +entry: +; CHECK: l2f: + +; CHECK: call #__mspabi_fltlif + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to float + + ret float %1 +} + +define float @ll2f() #0 { +entry: +; CHECK: ll2f: + +; CHECK: call #__mspabi_fltllif + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to float + + ret float %1 +} + +define float @ul2f() #0 { +entry: +; CHECK: ul2f: + +; CHECK: call #__mspabi_fltulf + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to float + + ret float %1 +} + +define float @ull2f() #0 { +entry: +; CHECK: ull2f: + +; CHECK: call #__mspabi_fltullf + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to float + + ret float %1 +} + +define i1 @cmpd_oeq() #0 { +entry: +; CHECK: cmpd_oeq: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oeq double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_une() #0 { +entry: +; CHECK: cmpd_une: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp une double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_oge() #0 { +entry: +; CHECK: cmpd_oge: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oge double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_olt() #0 { +entry: +; CHECK: cmpd_olt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp olt double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ole() #0 { +entry: +; CHECK: cmpd_ole: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ole double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ogt() #0 { +entry: +; CHECK: cmpd_ogt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ogt double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_oeq() #0 { +entry: +; CHECK: cmpf_oeq: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oeq float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_une() #0 { +entry: +; CHECK: cmpf_une: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp une float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_oge() #0 { +entry: +; CHECK: cmpf_oge: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oge float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_olt() #0 { +entry: +; CHECK: cmpf_olt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp olt float %0, 123.0 + + ret i1 %1 +} + 
+define i1 @cmpf_ole() #0 { +entry: +; CHECK: cmpf_ole: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ole float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_ogt() #0 { +entry: +; CHECK: cmpf_ogt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ogt float %0, 123.0 + + ret i1 %1 +} + +define double @addd() #0 { +entry: +; CHECK: addd: + +; CHECK: call #__mspabi_addd + %0 = load volatile double, double* @g_double, align 8 + %1 = fadd double %0, 123.0 + + ret double %1 +} + +define float @addf() #0 { +entry: +; CHECK: addf: + +; CHECK: call #__mspabi_addf + %0 = load volatile float, float* @g_float, align 8 + %1 = fadd float %0, 123.0 + + ret float %1 +} + +define double @divd() #0 { +entry: +; CHECK: divd: + +; CHECK: call #__mspabi_divd + %0 = load volatile double, double* @g_double, align 8 + %1 = fdiv double %0, 123.0 + + ret double %1 +} + +define float @divf() #0 { +entry: +; CHECK: divf: + +; CHECK: call #__mspabi_divf + %0 = load volatile float, float* @g_float, align 8 + %1 = fdiv float %0, 123.0 + + ret float %1 +} + +define double @mpyd() #0 { +entry: +; CHECK: mpyd: + +; CHECK: call #__mspabi_mpyd + %0 = load volatile double, double* @g_double, align 8 + %1 = fmul double %0, 123.0 + + ret double %1 +} + +define float @mpyf() #0 { +entry: +; CHECK: mpyf: + +; CHECK: call #__mspabi_mpyf + %0 = load volatile float, float* @g_float, align 8 + %1 = fmul float %0, 123.0 + + ret float %1 +} + +define double @subd() #0 { +entry: +; CHECK: subd: + +; CHECK: call #__mspabi_subd + %0 = load volatile double, double* @g_double, align 8 + %1 = fsub double %0, %0 + + ret double %1 +} + +define float @subf() #0 { +entry: +; CHECK: subf: + +; CHECK: call #__mspabi_subf + %0 = load volatile float, float* @g_float, align 8 + %1 = fsub float %0, %0 + + ret float %1 +} + +define i16 @divi() #0 { +entry: +; CHECK: divi: + +; CHECK: call #__mspabi_divi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = sdiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divli() #0 { +entry: +; CHECK: divli: + +; CHECK: call #__mspabi_divli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sdiv i32 %0, %0 + + ret i32 %1 +} + +define i64 @divlli() #0 { +entry: +; CHECK: divlli: + +; CHECK: call #__mspabi_divlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sdiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @divu() #0 { +entry: +; CHECK: divu: + +; CHECK: call #__mspabi_divu + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = udiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divul() #0 { +entry: +; CHECK: divul: + +; CHECK: call #__mspabi_divul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = udiv i32 %0, %0 + + ret i32 %1 +} + +define i64 @divull() #0 { +entry: +; CHECK: divull: + +; CHECK: call #__mspabi_divull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = udiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @remi() #0 { +entry: +; CHECK: remi: + +; CHECK: call #__mspabi_remi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = srem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remli() #0 { +entry: +; CHECK: remli: + +; CHECK: call #__mspabi_remli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = srem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remlli() #0 { +entry: +; CHECK: remlli: + +; CHECK: call #__mspabi_remlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = srem i64 %0, %0 + + ret i64 %1 +} + +define i16 @remu() #0 { +entry: +; CHECK: remu: + +; CHECK: call #__mspabi_remu + %0 = load 
volatile i16, i16* @g_i16, align 8 + %1 = urem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remul() #0 { +entry: +; CHECK: remul: + +; CHECK: call #__mspabi_remul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = urem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remull() #0 { +entry: +; CHECK: remull: + +; CHECK: call #__mspabi_remull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = urem i64 %0, %0 + + ret i64 %1 +} + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll b/test/CodeGen/MSP430/promote-i8-mul.ll index dce9d25ca87a..0e05e3978b1e 100644 --- a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll +++ b/test/CodeGen/MSP430/promote-i8-mul.ll @@ -8,7 +8,7 @@ target triple = "msp430-elf" define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone { entry: ; CHECK-LABEL: foo: -; CHECK: call #__mulqi3 +; CHECK: call #__mspabi_mpyi %mul = mul i8 %_si2, %_si1 ; <i8> [#uses=1] ret i8 %mul } diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll index 076c44684579..6d747f09d8a7 100644 --- a/test/CodeGen/NVPTX/bug17709.ll +++ b/test/CodeGen/NVPTX/bug17709.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -; ModuleID = '__kernelgen_main_module' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { -entry: - ;unreachable - %t0 = insertvalue {double, double} undef, double 1.0, 0 - %t1 = insertvalue {double, double} %t0, double 1.0, 1 - ret { double, double } %t1 -} - -%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } -%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } -%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } -@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 - -; CHECK: .visible .entry __kernelgen_main -define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { -entry: - %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) - ret void -} - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+; ModuleID = '__kernelgen_main_module'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
+entry:
+ ;unreachable
+ %t0 = insertvalue {double, double} undef, double 1.0, 0
+ %t1 = insertvalue {double, double} %t0, double 1.0, 1
+ ret { double, double } %t1
+}
+
+%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
+%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
+
+; CHECK: .visible .entry __kernelgen_main
+define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) {
+entry:
+ %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8)
+ ret void
+}
+
+
diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll
index 005958bd938a..7aa29fe811dd 100644
--- a/test/CodeGen/NVPTX/ctlz.ll
+++ b/test/CodeGen/NVPTX/ctlz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll
index b961d4d27bdd..69a4f879a8d8 100644
--- a/test/CodeGen/NVPTX/ctpop.ll
+++ b/test/CodeGen/NVPTX/ctpop.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll
index 124ba9d1e9a7..0bfe0139bcdf 100644
--- a/test/CodeGen/NVPTX/cttz.ll
+++ b/test/CodeGen/NVPTX/cttz.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll
index 3d4140820794..08a2ee14e8bd 100644
--- a/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1,1078 +1,1079 @@
-; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
-; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_ret_const() #0 { - ret half 1.0 -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd(half %a, half %b) #0 { - %r = fadd half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_v1f16( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 { - %r = fadd <1 x half> %a, %b - ret <1 x half> %r -} - -; Check that we can lower fadd with immediate arguments. 
-; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_0(half %b) #0 { - %r = fadd half 1.0, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_imm_1( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_1(half %a) #0 { - %r = fadd half %a, 1.0 - ret half %r -} - -; CHECK-LABEL: test_fsub( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1]; -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fsub(half %a, half %b) #0 { - %r = fsub half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0]; -; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000 -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fneg(half %a) #0 { - %r = fsub half 0.0, %a - ret half %r -} - -; CHECK-LABEL: test_fmul( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1]; -; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fmul(half %a, half %b) #0 { - %r = fmul half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fdiv(half %a, half %b) #0 { - %r = fdiv half %a, %b - ret half %r -} - -; CHECK-LABEL: test_frem( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], 
[test_frem_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1]; -; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]]; -; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]]; -; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]]; -; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_frem(half %a, half %b) #0 { - %r = frem half %a, %b - ret half %r -} - -; CHECK-LABEL: test_store( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0]; -; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1]; -; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]]; -; CHECK-NEXT: ret; -define void @test_store(half %a, half* %b) #0 { - store half %a, half* %b - ret void -} - -; CHECK-LABEL: test_load( -; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0]; -; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_load(half* %a) #0 { - %r = load half, half* %a - ret half %r -} - -; CHECK-LABEL: .visible .func test_halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] -; CHECK: ret -define void @test_halfp0a1(half * noalias readonly %from, half * %to) { - %1 = load half, half * %from , align 1 - store half %1, half * %to , align 1 - ret void -} - -declare half @test_callee(half %a, half %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[A]]; -; CHECK-DAG: st.param.b16 [param1+0], [[B]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call(half %a, half %b) #0 { - %r = call half @test_callee(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call_flipped(half %a, half %b) #0 { - %r = call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: 
st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_tailcall_flipped(half %a, half %b) #0 { - %r = tail call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select(half %a, half %b, i1 zeroext %c) #0 { - %r = select i1 %c, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, float %a, float %b - ret float %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { - %cc = fcmp une float %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: 
setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_une(half %a, half %b) #0 { - %r = fcmp une half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ueq(half %a, half %b) #0 { - %r = fcmp ueq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ugt(half %a, half %b) #0 { - %r = fcmp ugt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uge(half %a, half %b) #0 { - %r = fcmp uge half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ult(half %a, half %b) #0 { - %r = fcmp ult half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ule(half %a, half %b) #0 { - %r = fcmp ule half %a, %b - ret i1 
%r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uno(half %a, half %b) #0 { - %r = fcmp uno half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_one(half %a, half %b) #0 { - %r = fcmp one half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oeq(half %a, half %b) #0 { - %r = fcmp oeq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ogt(half %a, half %b) #0 { - %r = fcmp ogt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oge(half %a, half %b) #0 { - %r = fcmp oge half %a, %b - ret i1 %r -} - -; XCHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 
[[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_olt(half %a, half %b) #0 { - %r = fcmp olt half %a, %b - ret i1 %r -} - -; XCHECK-LABEL: test_fcmp_ole( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1]; -; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ole(half %a, half %b) #0 { - %r = fcmp ole half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ord( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1]; -; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ord(half %a, half %b) #0 { - %r = fcmp ord half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_br_cc( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1]; -; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2]; -; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3]; -; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]]; -; CHECK: st.u32 [%[[C]]], -; CHECK: [[LABEL]]: -; CHECK: st.u32 [%[[D]]], -; CHECK: ret; -define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 { - %c = fcmp uge half %a, %b - br i1 %c, label %then, label %else -then: - store i32 0, i32* %p1 - ret void -else: - store i32 0, i32* %p2 - ret void -} - -; CHECK-LABEL: test_phi( -; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0]; -; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]]; -; CHECK: [[LOOP:LBB[0-9_]+]]: -; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]]; -; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]]; -; CHECK: { -; CHECK: st.param.b64 [param0+0], %[[P1]]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_dummy -; CHECK: } -; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1; -; CHECK: @[[PRED]] bra [[LOOP]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_phi(half* %p1) #0 { -entry: - %a = load half, half* %p1 - br label %loop -loop: - %r = phi half [%a, %entry], [%b, %loop] - %b = load half, half* %p1 - %c = call i1 @test_dummy(half* %p1) - br i1 %c, label %loop, label %return -return: - ret half %r -} -declare i1 @test_dummy(half* %p1) #0 - -; CHECK-LABEL: test_fptosi_i32( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0]; -; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define i32 @test_fptosi_i32(half %a) #0 { - %r = fptosi half %a to i32 - ret i32 %r -} - -; CHECK-LABEL: test_fptosi_i64( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], 
[test_fptosi_i64_param_0]; -; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK: ret; -define i64 @test_fptosi_i64(half %a) #0 { - %r = fptosi half %a to i64 - ret i64 %r -} - -; CHECK-LABEL: test_fptoui_i32( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0]; -; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define i32 @test_fptoui_i32(half %a) #0 { - %r = fptoui half %a to i32 - ret i32 %r -} - -; CHECK-LABEL: test_fptoui_i64( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0]; -; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK: ret; -define i64 @test_fptoui_i64(half %a) #0 { - %r = fptoui half %a to i64 - ret i64 %r -} - -; CHECK-LABEL: test_uitofp_i32( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0]; -; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_uitofp_i32(i32 %a) #0 { - %r = uitofp i32 %a to half - ret half %r -} - -; CHECK-LABEL: test_uitofp_i64( -; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0]; -; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_uitofp_i64(i64 %a) #0 { - %r = uitofp i64 %a to half - ret half %r -} - -; CHECK-LABEL: test_sitofp_i32( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0]; -; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_sitofp_i32(i32 %a) #0 { - %r = sitofp i32 %a to half - ret half %r -} - -; CHECK-LABEL: test_sitofp_i64( -; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0]; -; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_sitofp_i64(i64 %a) #0 { - %r = sitofp i64 %a to half - ret half %r -} - -; CHECK-LABEL: test_uitofp_i32_fadd( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0]; -; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1]; -; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 { - %c = uitofp i32 %a to half - %r = fadd half %b, %c - ret half %r -} - -; CHECK-LABEL: test_sitofp_i32_fadd( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0]; -; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1]; -; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]]; -; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] -; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]]; -; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { - %c = sitofp i32 %a to half - %r = fadd half %b, %c - ret half %r -} - -; CHECK-LABEL: test_fptrunc_float( -; CHECK: 
ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_fptrunc_float(float %a) #0 { - %r = fptrunc float %a to half - ret half %r -} - -; CHECK-LABEL: test_fptrunc_double( -; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0]; -; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_fptrunc_double(double %a) #0 { - %r = fptrunc double %a to half - ret half %r -} - -; CHECK-LABEL: test_fpext_float( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0]; -; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK: ret; -define float @test_fpext_float(half %a) #0 { - %r = fpext half %a to float - ret float %r -} - -; CHECK-LABEL: test_fpext_double( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0]; -; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]]; -; CHECK: st.param.f64 [func_retval0+0], [[R]]; -; CHECK: ret; -define double @test_fpext_double(half %a) #0 { - %r = fpext half %a to double - ret double %r -} - - -; CHECK-LABEL: test_bitcast_halftoi16( -; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0]; -; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]] -; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]] -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define i16 @test_bitcast_halftoi16(half %a) #0 { - %r = bitcast half %a to i16 - ret i16 %r -} - -; CHECK-LABEL: test_bitcast_i16tohalf( -; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0]; -; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]] -; CHECK: st.param.b16 [func_retval0+0], [[AH]]; -; CHECK: ret; -define half @test_bitcast_i16tohalf(i16 %a) #0 { - %r = bitcast i16 %a to half - ret half %r -} - - -declare half @llvm.sqrt.f16(half %a) #0 -declare half @llvm.powi.f16(half %a, i32 %b) #0 -declare half @llvm.sin.f16(half %a) #0 -declare half @llvm.cos.f16(half %a) #0 -declare half @llvm.pow.f16(half %a, half %b) #0 -declare half @llvm.exp.f16(half %a) #0 -declare half @llvm.exp2.f16(half %a) #0 -declare half @llvm.log.f16(half %a) #0 -declare half @llvm.log10.f16(half %a) #0 -declare half @llvm.log2.f16(half %a) #0 -declare half @llvm.fma.f16(half %a, half %b, half %c) #0 -declare half @llvm.fabs.f16(half %a) #0 -declare half @llvm.minnum.f16(half %a, half %b) #0 -declare half @llvm.maxnum.f16(half %a, half %b) #0 -declare half @llvm.copysign.f16(half %a, half %b) #0 -declare half @llvm.floor.f16(half %a) #0 -declare half @llvm.ceil.f16(half %a) #0 -declare half @llvm.trunc.f16(half %a) #0 -declare half @llvm.rint.f16(half %a) #0 -declare half @llvm.nearbyint.f16(half %a) #0 -declare half @llvm.round.f16(half %a) #0 -declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0 - -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0]; -; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_sqrt(half %a) #0 { - %r = call half @llvm.sqrt.f16(half %a) - ret half %r -} - -;;; Can't do this yet: requires libcall. 
-; XCHECK-LABEL: test_powi( -;define half @test_powi(half %a, i32 %b) #0 { -; %r = call half @llvm.powi.f16(half %a, i32 %b) -; ret half %r -;} - -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0]; -; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_sin(half %a) #0 #1 { - %r = call half @llvm.sin.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0]; -; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_cos(half %a) #0 #1 { - %r = call half @llvm.cos.f16(half %a) - ret half %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_pow( -;define half @test_pow(half %a, half %b) #0 { -; %r = call half @llvm.pow.f16(half %a, half %b) -; ret half %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp( -;define half @test_exp(half %a) #0 { -; %r = call half @llvm.exp.f16(half %a) -; ret half %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp2( -;define half @test_exp2(half %a) #0 { -; %r = call half @llvm.exp2.f16(half %a) -; ret half %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_log( -;define half @test_log(half %a) #0 { -; %r = call half @llvm.log.f16(half %a) -; ret half %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_log10( -;define half @test_log10(half %a) #0 { -; %r = call half @llvm.log10.f16(half %a) -; ret half %r -;} - -;;; Can't do this yet: requires libcall. 
-; XCHECK-LABEL: test_log2( -;define half @test_log2(half %a) #0 { -; %r = call half @llvm.log2.f16(half %a) -; ret half %r -;} - -; CHECK-LABEL: test_fma( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2]; -; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] -; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret -define half @test_fma(half %a, half %b, half %c) #0 { - %r = call half @llvm.fma.f16(half %a, half %b, half %c) - ret half %r -} - -; CHECK-LABEL: test_fabs( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0]; -; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_fabs(half %a) #0 { - %r = call half @llvm.fabs.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_minnum( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1]; -; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_minnum(half %a, half %b) #0 { - %r = call half @llvm.minnum.f16(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]]; -; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_maxnum(half %a, half %b) #0 { - %r = call half @llvm.maxnum.f16(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]]; -; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]]; -; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767; -; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768; -; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]]; -; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_copysign(half %a, half %b) #0 { - %r = call half @llvm.copysign.f16(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_copysign_f32( -; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1]; -; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]]; -; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]]; -; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; -; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648; -; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]]; -; CHECK: or.b16 
[[RX:%rs[0-9]+]], [[AX]], [[BX2]]; -; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_copysign_f32(half %a, float %b) #0 { - %tb = fptrunc float %b to half - %r = call half @llvm.copysign.f16(half %a, half %tb) - ret half %r -} - -; CHECK-LABEL: test_copysign_f64( -; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1]; -; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]]; -; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]]; -; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; -; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808; -; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48; -; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]]; -; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]]; -; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_copysign_f64(half %a, double %b) #0 { - %tb = fptrunc double %b to half - %r = call half @llvm.copysign.f16(half %a, half %tb) - ret half %r -} - -; CHECK-LABEL: test_copysign_extended( -; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0]; -; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]]; -; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]]; -; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767; -; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768; -; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]]; -; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; -; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]]; -; CHECK: st.param.f32 [func_retval0+0], [[XR]]; -; CHECK: ret; -define float @test_copysign_extended(half %a, half %b) #0 { - %r = call half @llvm.copysign.f16(half %a, half %b) - %xr = fpext half %r to float - ret float %xr -} - -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0]; -; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_floor(half %a) #0 { - %r = call half @llvm.floor.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0]; -; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_ceil(half %a) #0 { - %r = call half @llvm.ceil.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0]; -; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_trunc(half %a) #0 { - %r = call half @llvm.trunc.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0]; -; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_rint(half %a) #0 { - %r = call half @llvm.rint.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_nearbyint( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0]; -; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_nearbyint(half %a) #0 { - %r = call half @llvm.nearbyint.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_round( -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0]; -; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], 
[[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_round(half %a) #0 { - %r = call half @llvm.round.f16(half %a) - ret half %r -} - -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2]; -; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] -; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_fmuladd(half %a, half %b, half %c) #0 { - %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c) - ret half %r -} - -attributes #0 = { nounwind } -attributes #1 = { "unsafe-fp-math" = "true" } +; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_ret_const() #0 {
+ ret half 1.0
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_v1f16(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
+ %r = fadd <1 x half> %a, %b
+ ret <1 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_0(half %b) #0 {
+ %r = fadd half 1.0, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_1(half %a) #0 {
+ %r = fadd half %a, 1.0
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0];
+; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fneg(half %a) #0 {
+ %r = fsub half 0.0, %a
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
+; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
+; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
+; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
+; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_store(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0];
+; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
+; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
+; CHECK-NEXT: ret;
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load(
+; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
+; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+; CHECK-LABEL: .visible .func test_halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
+ %1 = load half, half * %from , align 1
+ store half %1, half * %to , align 1
+ ret void
+}
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[A]];
+; CHECK-DAG: st.param.b16 [param1+0], [[B]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, float %a, float %b
+ ret float %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
+ %cc = fcmp une float %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
+; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
+; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
+; CHECK: st.u32 [%[[C]]],
+; CHECK: [[LABEL]]:
+; CHECK: st.u32 [%[[D]]],
+; CHECK: ret;
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
+; CHECK-LABEL: test_phi(
+; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
+; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
+; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
+; CHECK: {
+; CHECK: st.param.b64 [param0+0], %[[P1]];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_dummy
+; CHECK: }
+; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
+; CHECK: @[[PRED]] bra [[LOOP]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
+; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
+; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
+; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
+; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
+; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
+; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float(
+; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double(
+; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
+; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
+; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
+; CHECK: st.param.f64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+
+; CHECK-LABEL: test_bitcast_halftoi16(
+; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
+; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
+; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf(
+; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
+; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
+; CHECK: st.param.b16 [func_retval0+0], [[AH]];
+; CHECK: ret;
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define half @test_powi(half %a, i32 %b) #0 {
+; %r = call half @llvm.powi.f16(half %a, i32 %b)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sin(half %a) #0 #1 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_cos(half %a) #0 #1 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define half @test_pow(half %a, half %b) #0 {
+; %r = call half @llvm.pow.f16(half %a, half %b)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define half @test_exp(half %a) #0 {
+; %r = call half @llvm.exp.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define half @test_exp2(half %a) #0 {
+; %r = call half @llvm.exp2.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define half @test_log(half %a) #0 {
+; %r = call half @llvm.log.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define half @test_log10(half %a) #0 {
+; %r = call half @llvm.log10.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define half @test_log2(half %a) #0 {
+; %r = call half @llvm.log2.f16(half %a)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
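+
+; The checks above show copysign lowered to integer bit operations on the raw
+; f16 encoding: keep the magnitude bits of %a (mask 0x7FFF / 32767) and the
+; sign bit of %b (mask 0x8000 / -32768), then OR them. An equivalent IR sketch
+; using explicit bitcasts (a hypothetical helper, not checked by any FileCheck
+; directive here):
+define half @copysign_bits_sketch(half %a, half %b) {
+  %ai = bitcast half %a to i16
+  %bi = bitcast half %b to i16
+  %mag = and i16 %ai, 32767       ; clear the sign bit of %a
+  %sgn = and i16 %bi, -32768      ; isolate the sign bit of %b
+  %ri = or i16 %mag, %sgn
+  %r = bitcast i16 %ri to half
+  ret half %r
+}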
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
+; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f32(half %a, float %b) #0 {
+ %tb = fptrunc float %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f64(half %a, double %b) #0 {
+ %tb = fptrunc double %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
+; CHECK: st.param.f32 [func_retval0+0], [[XR]];
+; CHECK: ret;
+define float @test_copysign_extended(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ %xr = fpext half %r to float
+ ret float %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
+; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
+; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
+; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll index 33bb616d895c..5dc796ada37f 100644 --- a/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1,1426 +1,1427 @@ -; ## Full FP16 support enabled by default. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s -; ## FP16 support explicitly disabled. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s -; ## FP16 is not supported by hardware. -; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184; -; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_ret_const() #0 { - ret <2 x half> <half 1.0, half 2.0> -} - -; CHECK-LABEL: test_extract_0( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0]; -; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_0(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 0 - ret half %e -} - -; CHECK-LABEL: test_extract_1( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0]; -; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_1(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 1 - ret half %e -} - -; CHECK-LABEL: test_extract_i( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0]; -; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]]; -; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { - %e = extractelement <2 x half> %a, i64 %idx - ret half %e -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1]; -; -; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { - %r = fadd <2 x half> %a, %b - 
ret <2 x half> %r -} - -; Check that we can lower fadd with immediate arguments. -; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0]; -; -; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184; -; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]]; -; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { - %r = fadd <2 x half> <half 1.0, half 2.0>, %a - ret <2 x half> %r -} - -; CHECK-LABEL: test_fadd_imm_1( -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0]; -; -; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184; -; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]]; -; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { - %r = fadd <2 x half> %a, <half 1.0, half 2.0> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fsub( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0]; -; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1]; -; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { - %r = fsub <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0]; -; -; CHECK-F16: mov.u32 [[I0:%r[0-9+]]], 0; -; CHECK-F16: mov.b32 [[IHH0:%hh[0-9+]]], [[I0]]; -; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]]; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fneg(<2 x half> %a) #0 { - %r = fsub <2 x half> <half 0.0, half 0.0>, %a - ret <2 x half> %r -} - -; CHECK-LABEL: test_fmul( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1]; -; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { - %r = fmul <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]; -; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { - %r = fdiv <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_frem( -; -- Load two 16x2 inputs and split them into f16 elements -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1]; -; -- Split into elements -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; -- promote to f32. -; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; -; -- frem(a[0],b[0]). -; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]]; -; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]]; -; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]]; -; -- frem(a[1],b[1]). 
-; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; -; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; -; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]]; -; -- convert back to f16. -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; -- merge into f16x2 and return it. -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { - %r = frem <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: .func test_ldst_v2f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; -; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]] -; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]]; -; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]}; -; CHECK: ret; -define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) { - %t1 = load <2 x half>, <2 x half>* %a - store <2 x half> %t1, <2 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v3f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; -; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair -; number of bitshifting instructions that may change at llvm's whim. -; So we only verify that we only issue correct number of writes using -; correct offset, but not the values we write. -; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.b16 [%[[B]]+4], -; CHECK: ret; -define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { - %t1 = load <3 x half>, <3 x half>* %a - store <3 x half> %t1, <3 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v4f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; -; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { - %t1 = load <4 x half>, <4 x half>* %a - store <4 x half> %t1, <4 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v8f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; -; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { - %t1 = load <8 x half>, <8 x half>* %a - store <8 x half> %t1, <8 x half>* %b, align 16 - ret void -} - -declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[A]]; -; CHECK-DAG: st.param.b32 [param1+0], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; 
-; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { - %r = select i1 %c, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; 
CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, - <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b - ret <2 x float> %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, - <2 x float> %c, <2 x float> %d) #0 { - %cc = fcmp une <2 x float> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp une <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ueq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ugt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; 
CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ult <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ule <2 x half> %a, %b - ret <2 x i1> %r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uno <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16x2 
[[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp one <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oeq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ogt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; 
CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp olt <2 x half> %a, %b - ret <2 x i1> %r -} - -; XCHECK-LABEL: test_fcmp_ole( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; -; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ole <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ord( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; -; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], 
[[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ord <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fptosi_i32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptosi_i64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { - %r = uitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { - %r = uitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { - %r = sitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r 
-} - -; CHECK-LABEL: test_sitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { - %r = sitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; - -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = uitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; -; -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = sitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xfloat( -; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define 
<2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { - %r = fptrunc <2 x float> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; -; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { - %r = fptrunc <2 x double> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fpext_2xfloat( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x float> - ret <2 x float> %r -} - -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x double> - ret <2 x double> %r -} - - -; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] -; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { - %r = bitcast <2 x half> %a to <2 x i16> - ret <2 x i16> %r -} - -; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( -; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; -; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; -; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { - %r = bitcast <2 x i16> %a to <2 x half> - ret <2 x half> %r -} - - -declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 -declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 -declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.copysign.f16(<2 x 
half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 - -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sqrt(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_powi( -;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { -; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) -; ret <2 x half> %r -;} - -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sin(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_cos(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_pow( -;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { -; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp( -;define <2 x half> @test_exp(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp2( -;define <2 x half> @test_exp2(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. 
-; XCHECK-LABEL: test_log( -;define <2 x half> @test_log(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.log.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_log10( -;define <2 x half> @test_log10(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_log2( -;define <2 x half> @test_log2(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -; CHECK-LABEL: test_fma( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} - -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret -define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { - %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) - ret <2 x half> %r -} - -; CHECK-LABEL: test_fabs( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_fabs(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_minnum( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> 
@test_minnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; -; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; -; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; 
CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { - %tb = fptrunc <2 x float> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f64( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; -; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; -; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; -; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; -; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { - %tb = fptrunc <2 x double> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_extended( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] -; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; -; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; -; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; -; CHECK: ret; -define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - %xr = fpext <2 x half> %r to <2 x float> - ret <2 x float> %xr -} - -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], 
{[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_floor(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_ceil(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_trunc(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_rint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_nearbyint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_nearbyint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_round( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_round(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.round.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { - %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) - ret <2 x half> %r -} - -attributes #0 = { nounwind } -attributes #1 = { "unsafe-fp-math" = "true" } +; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+ ret <2 x half> <half 1.0, half 2.0>
+}
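The 1073757184 immediate above is just the two IEEE-754 half bit patterns of 1.0 and 2.0 packed into the single 32-bit lane NVPTX uses for a <2 x half> value; the same packed constant reappears as the immediate operand in test_fadd_imm_0 and test_fadd_imm_1 below. A minimal C sketch of the encoding (illustration only, not part of the test; pack_v2f16 is an invented helper):

#include <stdint.h>
#include <stdio.h>

/* Pack two half-precision bit patterns into one 32-bit lane:
   element 0 goes in the low 16 bits, element 1 in the high 16 bits. */
static uint32_t pack_v2f16(uint16_t e0, uint16_t e1) {
    return ((uint32_t)e1 << 16) | e0;
}

int main(void) {
    /* half 1.0 = 0x3C00, half 2.0 = 0x4000 */
    printf("%u\n", pack_v2f16(0x3C00, 0x4000)); /* prints 1073757184 (0x40003C00) */
    return 0;
}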
+
+; CHECK-LABEL: test_extract_0(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 0
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 1
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+ %e = extractelement <2 x half> %a, i64 %idx
+ ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fadd <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+ %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+ %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fsub <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
+; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+ %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fmul <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fdiv <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
+; -- Load two 16x2 inputs and split them into f16 elements
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+ %r = frem <2 x half> %a, %b
+ ret <2 x half> %r
+}
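The CHECK lines above describe the per-element expansion used for frem: divide, round the quotient toward negative infinity (cvt.rmi), multiply back by the divisor, and subtract from the dividend. A scalar C model of exactly that instruction sequence, intended only to mirror the checks (frem_elem is an invented name):

#include <math.h>
#include <stdio.h>

/* q  = a / b        -> div.rn.f32
   fq = floor(q)     -> cvt.rmi.f32.f32
   r  = a - fq * b   -> mul.f32 + sub.f32 */
float frem_elem(float a, float b) {
    float q = a / b;
    float fq = floorf(q);
    return a - fq * b;
}

int main(void) {
    printf("%f\n", frem_elem(5.5f, 2.0f)); /* 1.500000 */
    return 0;
}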
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
+; CHECK: ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+ %t1 = load <2 x half>, <2 x half>* %a
+ store <2 x half> %t1, <2 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture as it's lowered as ld.b64 plus a fair
+; number of bit-shifting instructions that may change at LLVM's whim.
+; So we only verify that we issue the right number of stores at the
+; right offsets, and do not check the values we write.
+; CHECK-DAG: ld.u64
+; CHECK-DAG: st.u32 [%[[B]]],
+; CHECK-DAG: st.b16 [%[[B]]+4],
+; CHECK: ret;
+define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) {
+ %t1 = load <3 x half>, <3 x half>* %a
+ store <3 x half> %t1, <3 x half>* %b, align 16
+ ret void
+}
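To make the offsets in the comment above concrete: the store half of the v3 case is expected to be one 32-bit write covering elements 0-1 plus one 16-bit write at byte offset 4. A small C sketch of that access pattern (store_v3f16 is an invented helper, not anything the test runs):

#include <stdint.h>
#include <string.h>

/* st.u32 [b]    covers elements 0 and 1 (4 bytes at offset 0),
   st.b16 [b+4]  covers element 2        (2 bytes at offset 4). */
void store_v3f16(uint16_t *b, const uint16_t e[3]) {
    uint32_t lo;
    memcpy(&lo, e, sizeof lo);            /* pack elements 0-1 */
    memcpy(b, &lo, sizeof lo);            /* 32-bit store at offset 0 */
    memcpy((uint8_t *)b + 4, &e[2], 2);   /* 16-bit store at offset 4 */
}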
+
+; CHECK-LABEL: .func test_ldst_v4f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
+; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) {
+ %t1 = load <4 x half>, <4 x half>* %a
+ store <4 x half> %t1, <4 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v8f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
+; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) {
+ %t1 = load <8 x half>, <8 x half>* %a
+ store <8 x half> %t1, <8 x half>* %b, align 16
+ ret void
+}
+
+declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[A]];
+; CHECK-DAG: st.param.b32 [param1+0], [[B]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1];
+; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2]
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3];
+;
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+;
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
+;
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
+ <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]]
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
+ <2 x float> %c, <2 x float> %d) #0 {
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp une <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ueq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ugt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ult <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ule <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uno <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp one <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oeq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ogt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp olt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ole <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ord <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+ %r = uitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+ %r = uitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+ %r = sitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+ %r = sitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]];
+
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = uitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]];
+;
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = sitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+ %r = fptrunc <2 x float> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
+; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+ %r = fptrunc <2 x double> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x double>
+ ret <2 x double> %r
+}
+
+
+; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]]
+; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16
+; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]]
+; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
+ %r = bitcast <2 x half> %a to <2 x i16>
+ ret <2 x i16> %r
+}
+
+; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
+; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]];
+; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]];
+; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16;
+; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+ %r = bitcast <2 x i16> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+
+declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0
+declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sqrt(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 {
+; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 {
+; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define <2 x half> @test_exp(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define <2 x half> @test_exp2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define <2 x half> @test_log(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define <2 x half> @test_log10(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
+; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
+; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
+ %tb = fptrunc <2 x float> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
+; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
+ %tb = fptrunc <2 x double> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
+; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]];
+; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]];
+; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
+; CHECK: ret;
+define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ %xr = fpext <2 x half> %r to <2 x float>
+ ret <2 x float> %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_floor(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_ceil(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_trunc(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_rint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_round(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 6785a01827e2..351f9b20dc0c 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,42 +1,42 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
-
-declare float @dummy_f32(float, float) #0
-declare double @dummy_f64(double, double) #0
-
-define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- ret float %b
-}
-
-define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- %c = fadd float %a, %w
- %d = call float @dummy_f32(float %b, float %c)
- ret float %d
-}
-
-define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- ret double %b
-}
-
-define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- %c = fadd double %a, %w
- %d = call double @dummy_f64(double %b, double %c)
- ret double %d
-}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
+
+declare float @dummy_f32(float, float) #0
+declare double @dummy_f64(double, double) #0
+
+define ptx_device float @t1_f32(float %x, float %y, float %z) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ ret float %b
+}
+
+define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ %c = fadd float %a, %w
+ %d = call float @dummy_f32(float %b, float %c)
+ ret float %d
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
+
+define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ %c = fadd double %a, %w
+ %d = call double @dummy_f64(double %b, double %c)
+ ret double %d
+}
diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll
index 6a1e3a0e1a0d..c41da0eebd1f 100644
--- a/test/CodeGen/NVPTX/i8-param.ll
+++ b/test/CodeGen/NVPTX/i8-param.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-
-; CHECK: .visible .func (.param .b32 func_retval0) callee
-define i8 @callee(i8 %a) {
-; CHECK: ld.param.u8
- %ret = add i8 %a, 42
-; CHECK: st.param.b32
- ret i8 %ret
-}
-
-; CHECK: .visible .func caller
-define void @caller(i8* %a) {
-; CHECK: ld.u8
- %val = load i8, i8* %a
- %ret = tail call i8 @callee(i8 %val)
-; CHECK: ld.param.b32
- store i8 %ret, i8* %a
- ret void
-}
-
-
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK: .visible .func (.param .b32 func_retval0) callee
+define i8 @callee(i8 %a) {
+; CHECK: ld.param.u8
+ %ret = add i8 %a, 42
+; CHECK: st.param.b32
+ ret i8 %ret
+}
+
+; CHECK: .visible .func caller
+define void @caller(i8* %a) {
+; CHECK: ld.u8
+ %val = load i8, i8* %a
+ %ret = tail call i8 @callee(i8 %val)
+; CHECK: ld.param.b32
+ store i8 %ret, i8* %a
+ ret void
+}
+
+
diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll
index 8a67567acc96..83991a2930a8 100644
--- a/test/CodeGen/NVPTX/param-load-store.ll
+++ b/test/CodeGen/NVPTX/param-load-store.ll
@@ -1,939 +1,939 @@
-; Verifies correctness of load/store of parameters and return values.
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s
-
-%s_i1 = type { i1 }
-%s_i8 = type { i8 }
-%s_i16 = type { i16 }
-%s_f16 = type { half }
-%s_i32 = type { i32 }
-%s_f32 = type { float }
-%s_i64 = type { i64 }
-%s_f64 = type { double }
-
-; More complicated types. i64 is used to increase natural alignment
-; requirement for the type.
-%s_i32x4 = type { i32, i32, i32, i32, i64}
-%s_i32f32 = type { i32, float, i32, float, i64}
-%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
-%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
-%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
-; All scalar parameters must be at least 32 bits in size.
-; i1 is loaded/stored as i8.
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i1(
-; CHECK-NEXT: .param .b32 test_i1_param_0
-; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
-; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1;
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]]
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni
-; CHECK-NEXT: test_i1,
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i1 @test_i1(i1 %a) {
- %r = tail call i1 @test_i1(i1 %a);
- ret i1 %r;
-}
-
-; Signed i1 is a somewhat special case. We only care about one bit and
-; then us neg.s32 to convert it to 32-bit -1 if it's set.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i1s(
-; CHECK-NEXT: .param .b32 test_i1s_param_0
-; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
-; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
-; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
-; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i1 @test_i1s(i1 signext %a) {
- %r = tail call signext i1 @test_i1s(i1 signext %a);
- ret i1 %r;
-}
-
-; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v3i1(
-; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i1,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i1> @test_v3i1(<3 x i1> %a) {
- %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
- ret <3 x i1> %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v4i1(
-; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK: test_v4i1,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
-; CHECK-NEXT: ret;
-define <4 x i1> @test_v4i1(<4 x i1> %a) {
- %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
- ret <4 x i1> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v5i1(
-; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
-; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i1,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i1> @test_v5i1(<5 x i1> %a) {
- %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
- ret <5 x i1> %r;
-}
-
-; Unsigned i8 is loaded directly into 32-bit register.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i8(
-; CHECK-NEXT: .param .b32 test_i8_param_0
-; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
-; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK: test_i8,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i8 @test_i8(i8 %a) {
- %r = tail call i8 @test_i8(i8 %a);
- ret i8 %r;
-}
-
-; signed i8 is loaded into 16-bit register which is then sign-extended to i32.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i8s(
-; CHECK-NEXT: .param .b32 test_i8s_param_0
-; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
-; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK: test_i8s,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
-; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
-; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
-; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i8 @test_i8s(i8 signext %a) {
- %r = tail call signext i8 @test_i8s(i8 signext %a);
- ret i8 %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v3i8(
-; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i8,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i8> @test_v3i8(<3 x i8> %a) {
- %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
- ret <3 x i8> %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v4i8(
-; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i8,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-NEXT: ret;
-define <4 x i8> @test_v4i8(<4 x i8> %a) {
- %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
- ret <4 x i8> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v5i8(
-; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
-; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
-; CHECK-DAG ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i8,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i8> @test_v5i8(<5 x i8> %a) {
- %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
- ret <5 x i8> %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i16(
-; CHECK-NEXT: .param .b32 test_i16_param_0
-; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
-; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i16,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i16 @test_i16(i16 %a) {
- %r = tail call i16 @test_i16(i16 %a);
- ret i16 %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i16s(
-; CHECK-NEXT: .param .b32 test_i16s_param_0
-; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
-; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i16s,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
-; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i16 @test_i16s(i16 signext %a) {
- %r = tail call signext i16 @test_i16s(i16 signext %a);
- ret i16 %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v3i16(
-; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
-; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b16 [param0+4], [[E2]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i16,
-; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i16> @test_v3i16(<3 x i16> %a) {
- %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
- ret <3 x i16> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v4i16(
-; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
-; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i16,
-; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]],
[[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; -define <4 x i16> @test_v4i16(<4 x i16> %a) { - %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); - ret <4 x i16> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5i16( -; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] -; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i16, -; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i16> @test_v5i16(<5 x i16> %a) { - %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); - ret <5 x i16> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f16( -; CHECK-NEXT: .param .b32 test_f16_param_0 -; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b16 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define half @test_f16(half %a) { - %r = tail call half @test_f16(half %a); - ret half %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v2f16( -; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] -; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2f16, -; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define <2 x half> @test_v2f16(<2 x half> %a) { - %r = tail call <2 x half> @test_v2f16(<2 x half> %a); - ret <2 x half> %r; -} - -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3f16( -; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] -; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; -; CHECK: ret; -define <3 x half> @test_v3f16(<3 x half> %a) { - %r = tail call <3 x half> @test_v3f16(<3 x half> %a); - ret <3 x half> %r; -} - -; CHECK:.func (.param .align 8 
.b8 func_retval0[8]) -; CHECK-LABEL: test_v4f16( -; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] -; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]}; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v4f16, -; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]}; -; CHECK: ret; -define <4 x half> @test_v4f16(<4 x half> %a) { - %r = tail call <4 x half> @test_v4f16(<4 x half> %a); - ret <4 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5f16( -; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; -; CHECK: ret; -define <5 x half> @test_v5f16(<5 x half> %a) { - %r = tail call <5 x half> @test_v5f16(<5 x half> %a); - ret <5 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v8f16( -; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] -; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; -; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v8f16, -; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; -; CHECK: ret; -define <8 x half> @test_v8f16(<8 x half> %a) { - %r = tail call <8 x half> @test_v8f16(<8 x half> %a); - ret <8 x half> %r; -} - -; CHECK:.func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v9f16( -; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; -; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; -; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: st.param.v4.b16 [param0+8], -; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; -; CHECK: .param .align 32 .b8 retval0[32]; 
-; CHECK: call.uni (retval0), -; CHECK: test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; -; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; -; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; -; CHECK: ret; -define <9 x half> @test_v9f16(<9 x half> %a) { - %r = tail call <9 x half> @test_v9f16(<9 x half> %a); - ret <9 x half> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i32( -; CHECK-NEXT: .param .b32 test_i32_param_0 -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i32 @test_i32(i32 %a) { - %r = tail call i32 @test_i32(i32 %a); - ret i32 %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v3i32( -; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i32, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i32> @test_v3i32(<3 x i32> %a) { - %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a); - ret <3 x i32> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v4i32( -; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] -; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i32, -; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHCK-NEXT: ret; -define <4 x i32> @test_v4i32(<4 x i32> %a) { - %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); - ret <4 x i32> %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v5i32( -; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] -; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; -; CHECK-DAG ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: 
call.uni (retval0), -; CHECK-NEXT: test_v5i32, -; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i32> @test_v5i32(<5 x i32> %a) { - %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a); - ret <5 x i32> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f32( -; CHECK-NEXT: .param .b32 test_f32_param_0 -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_f32(float %a) { - %r = tail call float @test_f32(float %a); - ret float %r; -} - -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i64( -; CHECK-NEXT: .param .b64 test_i64_param_0 -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i64 @test_i64(i64 %a) { - %r = tail call i64 @test_i64(i64 %a); - ret i64 %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v3i64( -; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] -; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b64 [param0+16], [[E2]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i64> @test_v3i64(<3 x i64> %a) { - %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); - ret <3 x i64> %r; -} - -; For i64 vector loads are limited by PTX to 2 elements. 
-; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v4i64( -; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] -; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-NEXT: ret; -define <4 x i64> @test_v4i64(<4 x i64> %a) { - %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); - ret <4 x i64> %r; -} - -; Aggregates, on the other hand, do not get extended. - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i1( -; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i1, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i1 @test_s_i1(%s_i1 %a) { - %r = tail call %s_i1 @test_s_i1(%s_i1 %a); - ret %s_i1 %r; -} - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i8( -; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i8, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i8 @test_s_i8(%s_i8 %a) { - %r = tail call %s_i8 @test_s_i8(%s_i8 %a); - ret %s_i8 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_i16( -; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] -; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i16 @test_s_i16(%s_i16 %a) { - %r = tail call %s_i16 @test_s_i16(%s_i16 %a); - ret %s_i16 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_f16( -; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f16 @test_s_f16(%s_f16 %a) { - %r = tail call %s_f16 @test_s_f16(%s_f16 %a); - ret %s_f16 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_i32( -; CHECK-NEXT: .param 
.align 4 .b8 test_s_i32_param_0[4] -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i32 @test_s_i32(%s_i32 %a) { - %r = tail call %s_i32 @test_s_i32(%s_i32 %a); - ret %s_i32 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_f32( -; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f32 @test_s_f32(%s_f32 %a) { - %r = tail call %s_f32 @test_s_f32(%s_f32 %a); - ret %s_f32 %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_s_i64( -; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i64 @test_s_i64(%s_i64 %a) { - %r = tail call %s_i64 @test_s_i64(%s_i64 %a); - ret %s_i64 %r; -} - -; Fields that have different types, but identical sizes are not vectorized. -; CHECK: .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32f32( -; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] -; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; -; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; -; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; -; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; -; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; -; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32f32, -; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; -; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; -; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; -; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; -; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; -define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { - %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); - ret %s_i32f32 %r; -} - -; We do vectorize consecutive fields with matching types. 
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32x4( -; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] -; CHECK-DAG: ld.param.u64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; -; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; -; CHECK: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32x4, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; -; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; - -define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { - %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a); - ret %s_i32x4 %r; -} - -; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) -; CHECK-LABEL: test_s_i1i32x4( -; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] -; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; -; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; -; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; -; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+8], [[E2]]; -; CHECK: st.param.b32 [param0+12], [[E3]]; -; CHECK: st.param.b32 [param0+16], [[E4]]; -; CHECK: st.param.b64 [param0+24], [[E5]]; -; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_s_i1i32x4, -; CHECK: ( -; CHECK: param0 -; CHECK: ); -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; -; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; -; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; -; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; -; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; -; CHECK: ret; - -define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { - %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); - ret %s_i8i32x4 %r; -} - -; -- All loads/stores from parameters aligned by one must be done one -; -- byte at a time. 
-; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) -; CHECK-LABEL: test_s_i1i32x4p( -; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. -; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], -; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], -; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], -; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], -; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], - -define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { - %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); - ret %s_i8i32x4p %r; -} - -; Check that we can vectorize loads that span multiple aggregate fields. 
-; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) -; CHECK-LABEL: test_s_crossfield( -; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] -; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; -; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; -; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; -; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; -; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; -; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; -; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; -; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; -; CHECK: st.param.b32 [param0+64], [[E15]]; -; CHECK: .param .align 16 .b8 retval0[80]; -; CHECK: call.uni (retval0), -; CHECK: test_s_crossfield, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; -; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; -; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; -; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; -; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; -; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; -; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; -; CHECK: ret; - -define %s_crossfield @test_s_crossfield(%s_crossfield %a) { - %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); - ret %s_crossfield %r; -} +; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck %s
+
+%s_i1 = type { i1 }
+%s_i8 = type { i8 }
+%s_i16 = type { i16 }
+%s_f16 = type { half }
+%s_i32 = type { i32 }
+%s_f32 = type { float }
+%s_i64 = type { i64 }
+%s_f64 = type { double }
+
+; More complicated types. i64 is used to increase natural alignment
+; requirement for the type.
+%s_i32x4 = type { i32, i32, i32, i32, i64}
+%s_i32f32 = type { i32, float, i32, float, i64}
+%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
+%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
+%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
+; All scalar parameters must be at least 32 bits in size.
+; i1 is loaded/stored as i8.
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1(
+; CHECK-NEXT: .param .b32 test_i1_param_0
+; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]]
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni
+; CHECK-NEXT: test_i1,
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i1 @test_i1(i1 %a) {
+ %r = tail call i1 @test_i1(i1 %a);
+ ret i1 %r;
+}
+
+; Signed i1 is a somewhat special case. We only care about one bit and
+; then use neg.s32 to convert it to 32-bit -1 if it's set.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1s(
+; CHECK-NEXT: .param .b32 test_i1s_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
+; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
+; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i1 @test_i1s(i1 signext %a) {
+ %r = tail call signext i1 @test_i1s(i1 signext %a);
+ ret i1 %r;
+}
+
+; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i1,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i1> @test_v3i1(<3 x i1> %a) {
+ %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
+ ret <3 x i1> %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4i1,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
+; CHECK-NEXT: ret;
+define <4 x i1> @test_v4i1(<4 x i1> %a) {
+ %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
+ ret <4 x i1> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i1(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i1,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i1> @test_v5i1(<5 x i1> %a) {
+ %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
+ ret <5 x i1> %r;
+}
+
+; Unsigned i8 is loaded directly into a 32-bit register.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8(
+; CHECK-NEXT: .param .b32 test_i8_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i8 @test_i8(i8 %a) {
+ %r = tail call i8 @test_i8(i8 %a);
+ ret i8 %r;
+}
+
+; Signed i8 is loaded into a 16-bit register, which is then sign-extended to i32.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8s(
+; CHECK-NEXT: .param .b32 test_i8s_param_0
+; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8s,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
+; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i8 @test_i8s(i8 signext %a) {
+ %r = tail call signext i8 @test_i8s(i8 signext %a);
+ ret i8 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i8,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i8> @test_v3i8(<3 x i8> %a) {
+ %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
+ ret <3 x i8> %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i8,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i8> @test_v4i8(<4 x i8> %a) {
+ %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
+ ret <4 x i8> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i8(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+ %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+ ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+ %r = tail call i16 @test_i16(i16 %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+ %r = tail call signext i16 @test_i16s(i16 signext %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+ %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+ ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+ %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+ ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> @test_v5i16(<5 x i16> %a) {
+ %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+ ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f16(
+; CHECK-NEXT: .param .b32 test_f16_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_f16(half %a) {
+ %r = tail call half @test_f16(half %a);
+ ret half %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v2f16(
+; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
+; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v2f16,
+; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define <2 x half> @test_v2f16(<2 x half> %a) {
+ %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
+ ret <2 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3f16(
+; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
+; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v3f16,
+; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
+; CHECK: ret;
+define <3 x half> @test_v3f16(<3 x half> %a) {
+ %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+ ret <3 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4f16,
+; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK: ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+ %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+ ret <4 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v5f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
+; CHECK: ret;
+define <5 x half> @test_v5f16(<5 x half> %a) {
+ %r = tail call <5 x half> @test_v5f16(<5 x half> %a);
+ ret <5 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v8f16(
+; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
+; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]];
+; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v8f16,
+; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
+; CHECK: ret;
+define <8 x half> @test_v8f16(<8 x half> %a) {
+ %r = tail call <8 x half> @test_v8f16(<8 x half> %a);
+ ret <8 x half> %r;
+}
+
+; CHECK:.func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v9f16(
+; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.v4.b16 [param0+8],
+; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_v9f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
+; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
+; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
+; CHECK: ret;
+define <9 x half> @test_v9f16(<9 x half> %a) {
+ %r = tail call <9 x half> @test_v9f16(<9 x half> %a);
+ ret <9 x half> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+ %r = tail call i32 @test_i32(i32 %a);
+ ret i32 %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+ %r = tail call float @test_f32(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0;
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+ %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+ ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+ %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+ ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+ %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+ ret %s_i1 %r;
+}
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+ %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+ ret %s_i8 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+ %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+ ret %s_i16 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_f16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f16 @test_s_f16(%s_f16 %a) {
+ %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
+ ret %s_f16 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_i32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i32 @test_s_i32(%s_i32 %a) {
+ %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
+ ret %s_i32 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_f32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f32 @test_s_f32(%s_f32 %a) {
+ %r = tail call %s_f32 @test_s_f32(%s_f32 %a);
+ ret %s_f32 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_s_i64(
+; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i64 @test_s_i64(%s_i64 %a) {
+ %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
+ ret %s_i64 %r;
+}
+
+; Fields that have different types but identical sizes are not vectorized.
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+ %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+ ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; CHECK: st.param.b32 [param0+12], [[E3]];
+; CHECK: st.param.b32 [param0+16], [[E4]];
+; CHECK: st.param.b64 [param0+24], [[E5]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_i1i32x4,
+; CHECK: (
+; CHECK: param0
+; CHECK: );
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
+; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
+; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
+; CHECK: ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+ %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+ ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters with 1-byte alignment must be done
+; -- one byte at a time.
+; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
+; CHECK-LABEL: test_s_i1i32x4p(
+; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
+; --- TODO
+; --- Unaligned parameter store/ return value load is broken in both nvcc
+; --- and llvm and needs to be fixed.
+; CHECK: .param .align 1 .b8 param0[25];
+; CHECK-DAG: st.param.b32 [param0+0],
+; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+8],
+; CHECK-DAG: st.param.b32 [param0+9],
+; CHECK-DAG: st.param.b32 [param0+13],
+; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK: .param .align 1 .b8 retval0[25];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i1i32x4p,
+; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
+; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
+; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
+; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
+; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
+; CHECK-DAG: st.param.b32 [func_retval0+0],
+; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK-DAG: st.param.b32 [func_retval0+9],
+; CHECK-DAG: st.param.b32 [func_retval0+13],
+; CHECK-DAG: st.param.b64 [func_retval0+17],
+
+define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
+ %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
+ ret %s_i8i32x4p %r;
+}
+
+; Check that we can vectorize loads that span multiple aggregate fields.
+; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80])
+; CHECK-LABEL: test_s_crossfield(
+; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80]
+; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
+; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
+; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
+; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
+; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
+; CHECK: .param .align 16 .b8 param0[80];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
+; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
+; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
+; CHECK: st.param.b32 [param0+64], [[E15]];
+; CHECK: .param .align 16 .b8 retval0[80];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_crossfield,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
+; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
+; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
+; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
+; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
+; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
+; CHECK: st.param.b32 [func_retval0+64], [[RE15]];
+; CHECK: ret;
+
+define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
+ %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a);
+ ret %s_crossfield %r;
+}
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll index fb01eb262adc..ecdf55ecdbeb 100644 --- a/test/CodeGen/NVPTX/sched1.ll +++ b/test/CodeGen/NVPTX/sched1.ll @@ -6,10 +6,10 @@ define void @foo(i32* %a) { ; CHECK: .func foo ; CHECK: ld.u32 ; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr i32, i32* %a, i32 0 %val0 = load i32, i32* %ptr0 diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll index 91ed77878f81..347f77c5682c 100644 --- a/test/CodeGen/NVPTX/sched2.ll +++ b/test/CodeGen/NVPTX/sched2.ll @@ -4,12 +4,12 @@ define void @foo(<2 x i32>* %a) { ; CHECK: .func foo ; CHECK: ld.v2.u32 ; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0 diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll index da6568685fe6..8ff0b5da5bcc 100644 --- a/test/CodeGen/NVPTX/simple-call.ll +++ b/test/CodeGen/NVPTX/simple-call.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s - - - -; CHECK: .func ({{.*}}) device_func -define float @device_func(float %a) noinline { - %ret = fmul float %a, %a - ret float %ret -} - -; CHECK: .entry kernel_func -define void @kernel_func(float* %a) { - %val = load float, float* %a -; CHECK: call.uni (retval0), -; CHECK: device_func, - %mul = call float @device_func(float %val) - store float %mul, float* %a - ret void -} - - - -!nvvm.annotations = !{!1} - -!1 = !{void (float*)* @kernel_func, !"kernel", i32 1} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+
+
+; CHECK: .func ({{.*}}) device_func
+define float @device_func(float %a) noinline {
+ %ret = fmul float %a, %a
+ ret float %ret
+}
+
+; CHECK: .entry kernel_func
+define void @kernel_func(float* %a) {
+ %val = load float, float* %a
+; CHECK: call.uni (retval0),
+; CHECK: device_func,
+ %mul = call float @device_func(float %val)
+ store float %mul, float* %a
+ ret void
+}
+
+
+
+!nvvm.annotations = !{!1}
+
+!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll index a86ba1e29d5c..93b39c1125f8 100644 --- a/test/CodeGen/NVPTX/vec8.ll +++ b/test/CodeGen/NVPTX/vec8.ll @@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) { ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] ; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] -; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll index bf7b931a5758..d1ec8d25a107 100644 --- a/test/CodeGen/NVPTX/vector-call.ll +++ b/test/CodeGen/NVPTX/vector-call.ll @@ -1,30 +1,30 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -target triple = "nvptx-unknown-cuda" - -declare void @bar(<4 x i32>) - -; CHECK-LABEL: .func foo( -; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: call.uni -; CHECK: ret; -define void @foo(<4 x i32> %a) { - tail call void @bar(<4 x i32> %a) - ret void -} - -; CHECK-LABEL: .func foo3( -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK: call.uni -; CHECK: ret; -declare void @bar3(<3 x i32>) -define void @foo3(<3 x i32> %a) { - tail call void @bar3(<3 x i32> %a) - ret void -} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
+define void @foo(<4 x i32> %a) {
+ tail call void @bar(<4 x i32> %a)
+ ret void
+}
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK: call.uni
+; CHECK: ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+ tail call void @bar3(<3 x i32> %a)
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/zeroext-32bit.ll b/test/CodeGen/NVPTX/zeroext-32bit.ll index c2f0ec4b1447..bcfd987b4a66 100644 --- a/test/CodeGen/NVPTX/zeroext-32bit.ll +++ b/test/CodeGen/NVPTX/zeroext-32bit.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s - -; The zeroext attribute below should be silently ignored because -; we can pass a 32-bit integer across a function call without -; needing to extend it. - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64-unknown-cuda" - -; CHECK-LABEL: .visible .func zeroext_test -; CHECK-NOT: cvt.u32.u16 -define void @zeroext_test() { - tail call void @call1(i32 zeroext 0) - ret void -} - -declare void @call1(i32 zeroext) - -; CHECK-LABEL: .visible .func signext_test -; CHECK-NOT: cvt.s32.s16 -define void @signext_test() { - tail call void @call2(i32 zeroext 0) - ret void -} - -declare void @call2(i32 zeroext) +; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test() {
+ tail call void @call1(i32 zeroext 0)
+ ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test() {
+ tail call void @call2(i32 zeroext 0)
+ ret void
+}
+
+declare void @call2(i32 zeroext)
diff --git a/test/CodeGen/PowerPC/mtvsrdd.ll b/test/CodeGen/PowerPC/mtvsrdd.ll new file mode 100644 index 000000000000..1d6a3553b2a1 --- /dev/null +++ b/test/CodeGen/PowerPC/mtvsrdd.ll @@ -0,0 +1,22 @@ +; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \ +; RUN: < %s | FileCheck %s + +; This test case checks r0 is used as constant 0 in instruction mtvsrdd. + +define <2 x i64> @const0(i64 %a) { + %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 0, i32 1 + ret <2 x i64> %vecinit1 +; CHECK-LABEL: const0 +; CHECK: mtvsrdd v2, 0, r3 +} + +define <2 x i64> @noconst0(i64* %a, i64* %b) { + %1 = load i64, i64* %a, align 8 + %2 = load i64, i64* %b, align 8 + %vecinit = insertelement <2 x i64> undef, i64 %2, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 + ret <2 x i64> %vecinit1 +; CHECK-LABEL: noconst0 +; CHECK: mtvsrdd v2, {{r[0-9]+}}, {{r[0-9]+}} +} diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll index 2ed08e2ae380..a5a86f101a94 100644 --- a/test/CodeGen/PowerPC/setcc-logic.ll +++ b/test/CodeGen/PowerPC/setcc-logic.ll @@ -6,7 +6,7 @@ define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) { ; CHECK: # BB#0: ; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: cntlzw 3, 3 -; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr %a = icmp eq i32 %P, 0 %b = icmp eq i32 %Q, 0 @@ -30,11 +30,11 @@ define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) { define zeroext i1 @all_bits_set(i32 %P, i32 %Q) { ; CHECK-LABEL: all_bits_set: ; CHECK: # BB#0: +; CHECK-NEXT: li 5, -1 ; CHECK-NEXT: and 3, 3, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 12, 1 -; CHECK-NEXT: cmpwi 0, 3, -1 -; CHECK-NEXT: isel 3, 12, 5, 2 +; CHECK-NEXT: xor 3, 3, 5 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr %a = icmp eq i32 %P, -1 %b = icmp eq i32 %Q, -1 @@ -437,7 +437,7 @@ define zeroext i1 @and_eq(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 z ; CHECK-NEXT: xor 3, 3, 4 ; CHECK-NEXT: or 3, 3, 5 ; CHECK-NEXT: cntlzw 3, 3 -; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr %cmp1 = icmp eq i16 %a, %b %cmp2 = icmp eq i16 %c, %d diff --git a/test/CodeGen/PowerPC/stackmap-frame-setup.ll b/test/CodeGen/PowerPC/stackmap-frame-setup.ll index b5f1d4cfe4bc..b677b8be2966 100644 --- a/test/CodeGen/PowerPC/stackmap-frame-setup.ll +++ b/test/CodeGen/PowerPC/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN 0, implicit-def +; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def call void (i64, i32, ...) 
@llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def ret void diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll index c9b5bf8c9eeb..9665901e874f 100644 --- a/test/CodeGen/PowerPC/tail-dup-layout.ll +++ b/test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,4 +1,5 @@ -; RUN: llc -O2 < %s | FileCheck %s +; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s +; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" @@ -99,11 +100,9 @@ exit: ; test1 ; test2 ; test3 -; test4 ; optional1 ; optional2 ; optional3 -; optional4 ; exit ; even for 50/50 branches. ; Tail duplication puts test n+1 at the end of optional n @@ -163,6 +162,98 @@ exit: } ; Intended layout: +; The chain-of-triangles based duplicating produces the layout when 3 +; instructions are allowed for tail-duplication. +; test1 +; test2 +; test3 +; optional1 +; optional2 +; optional3 +; exit +; +; Otherwise it produces the layout: +; test1 +; optional1 +; test2 +; optional2 +; test3 +; optional3 +; exit + +;CHECK-LABEL: straight_test_3_instr_test: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30 +;CHECK-NEXT: cmplwi {{[0-9]+}}, 2 + +;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: # %test2 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O3: blr +;CHECK-O3-NEXT: .[[OPT1LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-O3-NEXT: .[[OPT2LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-O3-NEXT: .[[OPT3LABEL]]: +;CHECK-O3: b .[[EXITLABEL]] + +;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional1 +;CHECK-O2: .[[TEST2LABEL]]: # %test2 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional2 +;CHECK-O2: .[[TEST3LABEL]]: # %test3 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional3 +;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O2: blr + + +define void @straight_test_3_instr_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 3 + %tagbit1eq0 = icmp eq i32 %tagbit1, 2 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 12 + %tagbit2eq0 = icmp eq i32 %tagbit2, 8 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 48 + %tagbit3eq0 = icmp eq i32 %tagbit3, 32 
+ br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1 +optional3: + call void @c() + br label %exit +exit: + ret void +} + +; Intended layout: ; The chain-based outlining produces the layout ; entry ; --- Begin loop --- diff --git a/test/CodeGen/PowerPC/testComparesieqsc.ll b/test/CodeGen/PowerPC/testComparesieqsc.ll new file mode 100644 index 000000000000..71ad5ed34969 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + 
store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqsi.ll b/test/CodeGen/PowerPC/testComparesieqsi.ll new file mode 100644 index 000000000000..16882dbd0045 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, 
.LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqss.ll b/test/CodeGen/PowerPC/testComparesieqss.ll new file mode 100644 index 000000000000..110c5a62804e --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqss.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqss.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 
@test_ieqss_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequc.ll b/test/CodeGen/PowerPC/testComparesiequc.ll new file mode 100644 index 000000000000..e2c975f2c191 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse 
nounwind readnone +define signext i32 @test_iequc(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext_z(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequi.ll b/test/CodeGen/PowerPC/testComparesiequi.ll new file mode 100644 index 
000000000000..789b176a7700 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 
5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequs.ll b/test/CodeGen/PowerPC/testComparesiequs.ll new file mode 100644 index 000000000000..b72943893e98 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequs.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequs.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse 
nounwind +define void @test_iequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsc.ll b/test/CodeGen/PowerPC/testCompareslleqsc.ll new file mode 100644 index 000000000000..56af12827931 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testCompareslleqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_z(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext_z(i8 signext %a) { +; 
CHECK-LABEL: test_lleqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsi.ll b/test/CodeGen/PowerPC/testCompareslleqsi.ll new file mode 100644 index 000000000000..90cf2c85888e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext(i32 signext %a, 
i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqss.ll b/test/CodeGen/PowerPC/testCompareslleqss.ll new file mode 100644 index 000000000000..df60a6ccc00e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqss.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd 
--implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_z(i16 signext %a) { +; CHECK-LABEL: test_lleqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) 
+; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequc.ll b/test/CodeGen/PowerPC/testComparesllequc.ll new file mode 100644 index 000000000000..248825761295 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequc.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; 
Function Attrs: norecurse nounwind +define void @test_llequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequi.ll b/test/CodeGen/PowerPC/testComparesllequi.ll new file mode 100644 index 000000000000..2342d80d94ef --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequi.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: 
srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequs.ll b/test/CodeGen/PowerPC/testComparesllequs.ll new file mode 100644 index 000000000000..e79a974c06f5 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequs.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 
%a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll index 87e0c4621c08..d586fe183a92 100644 --- a/test/CodeGen/SPARC/LeonItinerariesUT.ll +++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll @@ -28,9 +28,9 @@ ; LEON3_4_ITIN-LABEL: f32_ops: ; LEON3_4_ITIN: ld ; LEON3_4_ITIN-NEXT: ld -; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fadds ; LEON3_4_ITIN-NEXT: ld +; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fsubs ; LEON3_4_ITIN-NEXT: fmuls ; LEON3_4_ITIN-NEXT: retl @@ -47,4 +47,4 @@ entry: %6 = fmul float %5, %3 %7 = fdiv float %6, %4 ret float %7 -}
\ No newline at end of file +} diff --git a/test/CodeGen/SPARC/inlineasm-v9.ll b/test/CodeGen/SPARC/inlineasm-v9.ll new file mode 100644 index 000000000000..9c5424c46229 --- /dev/null +++ b/test/CodeGen/SPARC/inlineasm-v9.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=sparcv9 <%s | FileCheck %s + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} + +; CHECK-LABEL: faddq: +; CHECK: faddq %f0, %f4, %f0 +define fp128 @faddq(fp128, fp128) local_unnamed_addr #2 { +entry: + %2 = tail call fp128 asm sideeffect "faddq $1, $2, $0;", "=f,f,e"(fp128 %0, fp128 %1) #7 + ret fp128 %2 +} + +;; Ensure that 'e' can indeed go in the high area, and 'f' cannot. +; CHECK-LABEL: faddd_high: +; CHECK: fmovd %f2, %f32 +; CHECK: fmovd %f0, %f2 +; CHECK: faddd %f2, %f32, %f2 +define double @faddd_high(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e,~{d0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(double %0, double %1) #7 + ret double %2 +} + diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll index af631f0d29f5..35a62706c1ab 100644 --- a/test/CodeGen/SPARC/inlineasm.ll +++ b/test/CodeGen/SPARC/inlineasm.ll @@ -94,3 +94,21 @@ entry: %0 = call i64 asm sideeffect "xor $1, %g0, $0", "=r,0,~{i1}"(i64 5); ret i64 %0 } + + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. +; CHECK-LABEL: fadds: +; CHECK: fadds %f0, %f1, %f0 +define float @fadds(float, float) local_unnamed_addr #2 { +entry: + %2 = tail call float asm sideeffect "fadds $1, $2, $0;", "=f,f,e"(float %0, float %1) #7 + ret float %2 +} + +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} diff --git a/test/CodeGen/SystemZ/list-ilp-crash.ll b/test/CodeGen/SystemZ/list-ilp-crash.ll new file mode 100644 index 000000000000..c67ed318b93f --- /dev/null +++ b/test/CodeGen/SystemZ/list-ilp-crash.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -pre-RA-sched=list-ilp | FileCheck %s +; +; Check that list-ilp scheduler does not crash due to SystemZ's current use +; of MVT::Untyped. + +define void @pr32723(i8) { +; CHECK: .text +BB: + br label %CF245 + +CF245: ; preds = %CF245, %BB + %Shuff57 = shufflevector <4 x i8> zeroinitializer, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %Cmp84 = icmp uge i8 %0, undef + br i1 %Cmp84, label %CF245, label %CF260 + +CF260: ; preds = %CF245 + %B156 = sdiv <4 x i8> %Shuff57, %Shuff57 + br label %CF255 + +CF255: ; preds = %CF255, %CF260 + %I186 = insertelement <4 x i8> %B156, i8 %0, i32 2 + br label %CF255 +} diff --git a/test/CodeGen/SystemZ/lower-copy-undef-src.mir b/test/CodeGen/SystemZ/lower-copy-undef-src.mir new file mode 100644 index 000000000000..322460d79d68 --- /dev/null +++ b/test/CodeGen/SystemZ/lower-copy-undef-src.mir @@ -0,0 +1,14 @@ +# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s +# +# Test that a COPY with an undef source operand gets handled like an identity +# copy rather than lowered into a target instruction with the undef flag +# dropped. 
+--- +# CHECK-LABEL: name: undef_copy +# CHECK: %r13d = KILL undef %r0d, implicit killed %r12q, implicit-def %r12q +name: undef_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: %r12q + %r13d = COPY undef %r0d, implicit killed %r12q, implicit-def %r12q diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll index d8d60413cb0e..5e7a40299ed7 100644 --- a/test/CodeGen/Thumb2/v8_IT_5.ll +++ b/test/CodeGen/Thumb2/v8_IT_5.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: b ; CHECK: [[JUMPTARGET]]:{{.*}}%if.else173 ; CHECK-NEXT: mov.w -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop ; CHECK-NEXT: %if.else145 ; CHECK-NEXT: mov.w diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll index 4ec703921e29..24aa5b98d0bb 100644 --- a/test/CodeGen/X86/2007-01-08-InstrSched.ll +++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -13,10 +13,10 @@ define float @foo(float %x) nounwind { ; CHECK: mulss ; CHECK: mulss -; CHECK: mulss -; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss ; CHECK: ret } diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll index 8b11fd86ef17..ae60d57bbf49 100644 --- a/test/CodeGen/X86/2010-01-18-DbgValue.ll +++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll @@ -1,14 +1,19 @@ -; RUN: llc -march=x86 -O0 < %s | FileCheck %s -; Currently, dbg.declare generates a DEBUG_VALUE comment. Eventually it will -; generate DWARF and this test will need to be modified or removed. +; RUN: llc -march=x86 -O0 < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s +; CHECK-LABEL: .debug_info contents: + +; CHECK-LABEL: DW_TAG_subprogram +; CHECK: DW_AT_name [DW_FORM_strp] ( {{.*}}"foo") +; CHECK: DW_TAG_formal_parameter +; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 {{..}} ) +; DW_OP_fbreg ?? +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"my_r0") %struct.Pt = type { double, double } %struct.Rect = type { %struct.Pt, %struct.Pt } define double @foo(%struct.Rect* byval %my_r0) nounwind ssp !dbg !1 { entry: -;CHECK: DEBUG_VALUE %retval = alloca double ; <double*> [#uses=2] %0 = alloca double ; <double*> [#uses=2] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll deleted file mode 100644 index 495ff0304b1b..000000000000 --- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test LiveInterval update handling of DBG_VALUE. -; rdar://12777252. 
-; -; CHECK: %entry -; CHECK: DEBUG_VALUE: subdivp:hg -; CHECK: j - -%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 } -%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] } -%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp !dbg !14 { -entry: - call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !14) - %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0 - %0 = load i16, i16* %type, align 2 - %cmp = icmp eq i16 %0, 1 - br i1 %cmp, label %return, label %for.cond.preheader - -for.cond.preheader: ; preds = %entry - %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29, %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1 - %cmp22 = fcmp olt double 0.000000e+00, %dsq - %conv24 = zext i1 %cmp22 to i16 - br label %return - -return: ; preds = %for.cond.preheader, %entry - %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ] - ret i16 %retval.0 -} - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!12} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2) -!2 = !{} -!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "hgstruct", line: 492, file: !11, baseType: !7) -!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11) -!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!12 = !{i32 1, !"Debug Info Version", i32 3} -!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15) -!15 = !DISubroutineType(types: !16) -!16 = !{null} diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll deleted file mode 100644 index fbe6000d7ace..000000000000 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test MachineScheduler handling of DBG_VALUE. -; rdar://12776937. 
-; -; CHECK: %if.else581 -; CHECK: DEBUG_VALUE: num1 -; CHECK: call - -%union.rec = type {} - -@.str15 = external hidden unnamed_addr constant [6 x i8], align 1 - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp !dbg !21 { -entry: - %num14075 = alloca [20 x i8], align 16 - br label %if.end33 - -if.end33: ; preds = %entry - %cmp1733 = icmp eq i32 undef, 0 - br label %if.else581 - -if.else581: ; preds = %if.end33 - %cmp586 = icmp eq i8 undef, -123 - br i1 %cmp586, label %if.then588, label %if.else594 - -if.then588: ; preds = %if.else581 - br label %for.cond1710.preheader - -if.else594: ; preds = %if.else581 - unreachable - -for.cond1710.preheader: ; preds = %if.then588 - br label %for.cond1710 - -for.cond1710: ; preds = %for.cond1710, %for.cond1710.preheader - br i1 undef, label %for.cond1710, label %if.then3344 - -if.then3344: - br label %if.then4073 - -if.then4073: ; preds = %if.then3344 - call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !5) - %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0 - %0 = load i32, i32* undef, align 4 - %add4093 = add nsw i32 %0, 0 - %conv4094 = sitofp i32 %add4093 to float - %div4095 = fdiv float %conv4094, 5.670000e+02 - %conv4096 = fpext float %div4095 to double - %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind - br i1 %cmp1733, label %if.then4107, label %if.else4114 - -if.then4107: ; preds = %if.then4073 - unreachable - -if.else4114: ; preds = %if.then4073 - unreachable -} - -declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) 
- -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!35} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2) -!1 = !{!2} -!2 = !{} -!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15) -!5 = distinct !DILexicalBlock(line: 815, column: 0, file: !14, scope: !6) -!6 = distinct !DILexicalBlock(line: 812, column: 0, file: !14, scope: !7) -!7 = distinct !DILexicalBlock(line: 807, column: 0, file: !14, scope: !8) -!8 = distinct !DILexicalBlock(line: 440, column: 0, file: !14, scope: !9) -!9 = distinct !DILexicalBlock(line: 435, column: 0, file: !14, scope: !10) -!10 = distinct !DILexicalBlock(line: 434, column: 0, file: !14, scope: !11) -!11 = distinct !DILexicalBlock(line: 250, column: 0, file: !14, scope: !12) -!12 = distinct !DILexicalBlock(line: 249, column: 0, file: !14, scope: !13) -!13 = distinct !DILexicalBlock(line: 221, column: 0, file: !14, scope: !21) -!14 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") -!15 = !DICompositeType(tag: DW_TAG_array_type, size: 160, align: 8, baseType: !16, elements: !17) -!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char) -!17 = !{!18} -!18 = !DISubrange(count: 20) -!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") - -!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22) -!22 = !DISubroutineType(types: !23) -!23 = !{null} - -; Test DebugValue uses visited by RegisterPressureTracker findUseBetween(). -; -; CHECK: @main -; CHECK: DEBUG_VALUE: main:X -; CHECK: call - -%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" } -%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 } - -define void @main() uwtable ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !37 { -entry: - %X = alloca %"class.__gnu_cxx::hash_map", align 8 - br i1 undef, label %cond.true, label %cond.end - -cond.true: ; preds = %entry - unreachable - -cond.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !DIExpression()), !dbg !DILocation(scope: !37) - %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5 - invoke void @_Znwm() - to label %exit.i unwind label %lpad2.i.i.i.i - -exit.i: ; preds = %cond.end - unreachable - -lpad2.i.i.i.i: ; preds = %cond.end - %0 = landingpad { i8*, i32 } - cleanup - br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i - -if.then.i.i.i.i.i.i.i.i: ; preds = %lpad2.i.i.i.i - unreachable - -lpad.body.i.i: ; preds = %lpad2.i.i.i.i - resume { i8*, i32 } %0 -} - -declare i32 @__gxx_personality_v0(...) 
- -declare void @_Znwm() - -!llvm.dbg.cu = !{!30} - -!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2) -!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32) -!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null) -!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!35 = !{i32 1, !"Debug Info Version", i32 3} -!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22) diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll deleted file mode 100644 index a717202d3574..000000000000 --- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test RegisterPressure handling of DBG_VALUE. -; -; CHECK: %entry -; CHECK: DEBUG_VALUE: test:callback -; CHECK: ret - -%struct.btCompoundLeafCallback = type { i32, i32 } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define void @test() unnamed_addr uwtable ssp align 2 !dbg !2 { -entry: - %callback = alloca %struct.btCompoundLeafCallback, align 8 - br i1 undef, label %if.end, label %if.then - -if.then: ; preds = %entry - unreachable - -if.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !DIExpression()), !dbg !DILocation(scope: !2) - %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1 - store i32 0, i32* undef, align 8 - %cmp12447 = icmp sgt i32 undef, 0 - br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44 - -for.body.lr.ph: ; preds = %if.end - unreachable - -invoke.cont44: ; preds = %if.end - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6) -!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7) -!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4) -!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!7 = !DISubroutineType(types: !9) -!8 = !{i32 1, !"Debug Info Version", i32 3} -!9 = !{null} diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll new file mode 100644 index 000000000000..553bc2789ff0 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +define i64 @test_add_i64(i64 %arg1, i64 %arg2) { +; ALL-LABEL: test_add_i64: +; ALL: # BB#0: +; ALL-NEXT: leaq (%rsi,%rdi), %rax +; ALL-NEXT: retq + %ret = add i64 %arg1, %arg2 + ret i64 %ret +} + +define i32 @test_add_i32(i32 %arg1, i32 %arg2) { +; ALL-LABEL: test_add_i32: +; ALL: # BB#0: +; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: retq + %ret = add i32 %arg1, %arg2 + ret i32 %ret +} + +define i16 @test_add_i16(i16 %arg1, i16 %arg2) { +; ALL-LABEL: test_add_i16: +; ALL: # BB#0: +; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def> +; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def> +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ALL-NEXT: retq + %ret = add i16 %arg1, %arg2 + ret i16 %ret +} + +define i8 @test_add_i8(i8 %arg1, i8 %arg2) { +; ALL-LABEL: test_add_i8: +; ALL: # BB#0: +; ALL-NEXT: addb %dil, %sil +; ALL-NEXT: movl %esi, %eax +; ALL-NEXT: retq + %ret = add i8 %arg1, %arg2 + ret i8 %ret +} diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll index bf4c42cb4292..1aae1db8ab07 100644 --- a/test/CodeGen/X86/GlobalISel/binop.ll +++ b/test/CodeGen/X86/GlobalISel/binop.ll @@ -4,48 +4,6 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL -define i64 @test_add_i64(i64 %arg1, i64 %arg2) { -; ALL-LABEL: test_add_i64: -; ALL: # BB#0: -; ALL-NEXT: leaq (%rsi,%rdi), %rax -; ALL-NEXT: retq - %ret = add i64 %arg1, %arg2 - ret i64 %ret -} - -define i32 @test_add_i32(i32 %arg1, i32 %arg2) { -; ALL-LABEL: test_add_i32: -; ALL: # BB#0: -; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: retq - %ret = add i32 %arg1, %arg2 - ret i32 %ret -} - -define i16 @test_add_i16(i16 %arg1, i16 %arg2) { -; ALL-LABEL: test_add_i16: -; ALL: # BB#0: -; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def> -; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def> -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; ALL-NEXT: retq - %ret = add i16 %arg1, %arg2 - ret i16 %ret -} - -define i8 @test_add_i8(i8 %arg1, i8 %arg2) { -; ALL-LABEL: test_add_i8: -; ALL: # BB#0: -; ALL-NEXT: addb %dil, %sil -; ALL-NEXT: movl %esi, %eax -; ALL-NEXT: retq - %ret = add i8 %arg1, %arg2 - ret i8 %ret -} - define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { ; ALL-LABEL: test_sub_i64: ; ALL: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll new file mode 100644 index 000000000000..faa6a0350337 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/br.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 + +define void @uncondbr() { +; CHECK-LABEL: uncondbr: +; CHECK: # BB#1: # %entry +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_2: # %end +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_3: # %bb2 +; CHECK-NEXT: jmp .LBB0_2 +entry: 
+ br label %bb2 +end: + ret void +bb2: + br label %end +} + diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll new file mode 100644 index 000000000000..03692bb6b1de --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/cmp.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL + +define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { +; ALL-LABEL: test_icmp_eq_i8: +; ALL: # BB#0: +; ALL-NEXT: cmpb %sil, %dil +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { +; ALL-LABEL: test_icmp_eq_i16: +; ALL: # BB#0: +; ALL-NEXT: cmpw %si, %di +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { +; ALL-LABEL: test_icmp_eq_i64: +; ALL: # BB#0: +; ALL-NEXT: cmpq %rsi, %rdi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_eq_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ne_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setne %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ugt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: seta %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_uge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setae %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ult_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setb %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ule_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setbe %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sgt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setg %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setge %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_slt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setl %al +; ALL-NEXT: andl $1, %eax 
+; ALL-NEXT: retq + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sle_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setle %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll index c4d3566008b1..64cd0e70a4fd 100644 --- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll +++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll @@ -1,7 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 -; TODO merge with ext.ll after i64 sext suported on 32bit platform +; TODO merge with ext.ll after i64 sext suported on 32bit platform + +define i64 @test_zext_i1(i8 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: # kill: %DIL<def> %DIL<kill> %RDI<def> +; X64-NEXT: andq $1, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r +} define i64 @test_sext_i8(i8 %val) { ; X64-LABEL: test_sext_i8: diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll index 3c032686130e..4d4e3b05ca28 100644 --- a/test/CodeGen/X86/GlobalISel/ext.ll +++ b/test/CodeGen/X86/GlobalISel/ext.ll @@ -2,6 +2,24 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 ; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32 +define i32 @test_zext_i1(i32 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: andl $1, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_zext_i1: +; X32: # BB#0: +; X32-NEXT: leal 4(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: retl + %val = trunc i32 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r +} + define i32 @test_zext_i8(i8 %val) { ; X64-LABEL: test_zext_i8: ; X64: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir new file mode 100644 index 000000000000..68ccbbba0a73 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir @@ -0,0 +1,179 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s + +--- | + define i32 @test_cmp_i8(i8 %a, i8 %b) { + %r = icmp ult i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i16(i16 %a, i16 %b) { + %r = icmp ult i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i64(i64 %a, i64 %b) { + %r = icmp ult i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_p0(i32* %a, i32* %b) { + %r = icmp ult i32* %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +... 
+--- +name: test_cmp_i8 +# CHECK-LABEL: name: test_cmp_i8 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s8) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s8), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i16 +# CHECK-LABEL: name: test_cmp_i16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s16) = COPY %edi +# CHECK-NEXT: %1(s16) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s16), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i32 +# CHECK-LABEL: name: test_cmp_i32 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s32) = COPY %edi +# CHECK-NEXT: %1(s32) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s32), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i64 +# CHECK-LABEL: name: test_cmp_i64 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s64) = COPY %rdi +# CHECK-NEXT: %1(s64) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s64), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_p0 +# CHECK-LABEL: name: test_cmp_p0 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(p0) = COPY %rdi +# CHECK-NEXT: %1(p0) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(p0), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(p0) = COPY %rdi + %1(p0) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(p0), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir index 25af600f2299..6f051f1b6ea5 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --- | + define i64 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -16,6 +22,12 @@ ret i64 %r } + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_zext_i8(i8 %val) { %r = zext i8 %val to i64 ret i64 %r @@ -33,6 +45,32 @@ ... --- +name: test_sext_i1 +# CHECK-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_SEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_SEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + +... +--- name: test_sext_i8 # CHECK-LABEL: name: test_sext_i8 alignment: 4 @@ -102,6 +140,32 @@ body: | ... --- +name: test_zext_i1 +# CHECK-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_ZEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + +... +--- name: test_zext_i8 # CHECK-LABEL: name: test_zext_i8 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir index 46457e0fff59..c9add0dc4e95 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -11,6 +17,12 @@ ret i32 %r } + define i32 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i32 + ret i32 %r + } + define i32 @test_sext_i8(i8 %val) { %r = sext i8 %val to i32 ret i32 %r @@ -23,6 +35,32 @@ ... 
--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_ZEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_ZEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + +... +--- name: test_zext_i8 # ALL-LABEL: name: test_zext_i8 alignment: 4 @@ -69,6 +107,32 @@ body: | ... --- +name: test_sext_i1 +# ALL-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_SEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_SEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + +... +--- name: test_sext_i8 # ALL-LABEL: name: test_sext_i8 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/memop-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll index 49a7fd79f8b2..49a7fd79f8b2 100644 --- a/test/CodeGen/X86/GlobalISel/memop-x32.ll +++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll index a7407c0e6b75..3e45a9c9a49d 100644 --- a/test/CodeGen/X86/GlobalISel/memop.ll +++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll @@ -1,13 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY - +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s 
-o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY define i8 @test_load_i8(i8 * %p1) { ; ALL-LABEL: test_load_i8: @@ -77,34 +70,6 @@ define double @test_load_double(double * %p1) { ret double %r } -define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_noalign: -; SSE: # BB#0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_noalign: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovups (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r -} - -define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_align: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_align: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r -} - define i32 * @test_store_i32(i32 %val, i32 * %p1) { ; ALL-LABEL: test_store_i32: ; ALL: # BB#0: @@ -139,19 +104,6 @@ define float * @test_store_float(float %val, float * %p1) { ; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) ; SSE_GREEDY-NEXT: movq %rdi, %rax ; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_float: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax -; ALL_AVX_FAST-NEXT: movl %eax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_float: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq store float %val, float* %p1 ret float * %p1; } @@ -171,18 +123,6 @@ define double * @test_store_double(double %val, double * %p1) { ; SSE_GREEDY-NEXT: movq %rdi, %rax ; SSE_GREEDY-NEXT: retq ; -; ALL_AVX_FAST-LABEL: test_store_double: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax -; ALL_AVX_FAST-NEXT: movq %rax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_double: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq store double %val, double* %p1 ret double * %p1; } diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll new file mode 100644 index 000000000000..e218fded4d5f --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX + +define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r +} + +define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r +} + +define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 1 + 
ret void +} + +define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir index 3a65a9003773..1ea922ee475a 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir @@ -2,11 +2,6 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY --- | - ; ModuleID = 'tmp.ll' - source_filename = "tmp.ll" - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - target triple = "x86_64--linux-gnu" - define i8 @test_add_i8(i8 %arg1, i8 %arg2) { %ret = add i8 %arg1, %arg2 ret i8 %ret @@ -120,6 +115,26 @@ ret void } + define i1 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + ret i1 %r + } + ... --- name: test_add_i8 @@ -735,3 +750,103 @@ body: | RET 0 ... +--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... 
+--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir new file mode 100644 index 000000000000..6d8cd2b1367d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-br.mir @@ -0,0 +1,39 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32 + +--- | + define void @uncondbr() { + entry: + br label %bb2 + + end: ; preds = %bb2 + ret void + + bb2: ; preds = %entry + br label %end + } + +... +--- +name: uncondbr +# CHECK-LABEL: name: uncondbr +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: JMP_1 %bb.2.bb2 +# CHECK: JMP_1 %bb.1.end +body: | + bb.1.entry: + successors: %bb.3.bb2(0x80000000) + + G_BR %bb.3.bb2 + + bb.2.end: + RET 0 + + bb.3.bb2: + successors: %bb.2.end(0x80000000) + + G_BR %bb.2.end + +... diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir new file mode 100644 index 000000000000..1d3da6cb88b9 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir @@ -0,0 +1,563 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK + +--- | + define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +... 
+--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr8 } +# CHECK-NEXT: - { id: 1, class: gr8 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %dil +# CHECK-NEXT: %1 = COPY %sil +# CHECK-NEXT: CMP8rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr16 } +# CHECK-NEXT: - { id: 1, class: gr16 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %di +# CHECK-NEXT: %1 = COPY %si +# CHECK-NEXT: CMP16rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr64 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %rdi +# CHECK-NEXT: %1 = COPY %rsi +# CHECK-NEXT: CMP64rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ne_i32 +# CHECK-LABEL: name: test_icmp_ne_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETNEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ne), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ugt_i32 +# CHECK-LABEL: name: test_icmp_ugt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ugt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_uge_i32 +# CHECK-LABEL: name: test_icmp_uge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(uge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ult_i32 +# CHECK-LABEL: name: test_icmp_ult_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ule_i32 +# CHECK-LABEL: name: test_icmp_ule_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ule), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sgt_i32 +# CHECK-LABEL: name: test_icmp_sgt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_sge_i32 +# CHECK-LABEL: name: test_icmp_sge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_slt_i32 +# CHECK-LABEL: name: test_icmp_slt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(slt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sle_i32 +# CHECK-LABEL: name: test_icmp_sle_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sle), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir index 85b3f61a9e44..0844701487bc 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -13,6 +19,38 @@ ... --- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr8 } +# ALL-NEXT: - { id: 2, class: gr64 } +# ALL-NEXT: - { id: 3, class: gr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %1 = COPY %0 +# ALL-NEXT: %3 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2 = AND64ri8 %3, 1, implicit-def %eflags +# ALL-NEXT: %rax = COPY %2 +# ALL-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + +... +--- name: test_sext_i8 # ALL-LABEL: name: test_sext_i8 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir index 63aeae89bd1a..831d6efb75f1 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext.mir @@ -2,6 +2,11 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i1 %a) { + %r = zext i1 %a to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -24,6 +29,34 @@ ... 
--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %1 = AND32ri8 %2, 1, implicit-def %eflags +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s32) = G_ZEXT %0(s1) + %eax = COPY %1(s32) + RET 0, implicit %eax + +... +--- name: test_zext_i8 # ALL-LABEL: name: test_zext_i8 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir index 8e6a2771db6e..8e6a2771db6e 100644 --- a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir diff --git a/test/CodeGen/X86/GlobalISel/select-memop.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir index 817dc3cc9764..b57c9b0cca98 100644 --- a/test/CodeGen/X86/GlobalISel/select-memop.mir +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir @@ -34,7 +34,6 @@ ret float %r } - define double @test_load_double(double* %p1) { %r = load double, double* %p1 ret double %r @@ -45,16 +44,6 @@ ret double %r } - define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r - } - - define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r - } - define i32* @test_store_i32(i32 %val, i32* %p1) { store i32 %val, i32* %p1 ret i32* %p1 @@ -85,16 +74,6 @@ ret double* %p1 } - define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 16 - ret <4 x i32>* %p1 - } - - define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 1 - ret <4 x i32>* %p1 - } - define i32* @test_load_ptr(i32** %ptr1) { %p = load i32*, i32** %ptr1 ret i32* %p @@ -304,62 +283,6 @@ body: | ... --- -# ALL-LABEL: name: test_load_v4i32_noalign -name: test_load_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_v4i32_align -name: test_load_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- # ALL-LABEL: name: test_store_i32 name: test_store_i32 alignment: 4 @@ -530,66 +453,6 @@ body: | ... --- -# ALL-LABEL: name: test_store_v4i32_align -name: test_store_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_noalign -name: test_store_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- # ALL-LABEL: name: test_load_ptr name: test_load_ptr alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir new file mode 100644 index 000000000000..ce3f6b91dcf6 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir @@ -0,0 +1,143 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r + } + + define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r + } + + define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret <4 x i32>* %p1 + } + + define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret <4 x i32>* %p1 + } + +... +--- +# ALL-LABEL: name: test_load_v4i32_noalign +name: test_load_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_v4i32_align +name: test_load_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_store_v4i32_align +name: test_store_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_v4i32_noalign +name: test_store_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
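Editor's note (not part of the patch): the four <4 x i32> cases above previously lived in select-memop.mir and memop.ll; this change moves them into the new select-memop-v128.mir and memop-vec.ll so the renamed scalar files (memop-scalar.ll, select-memop-scalar.mir) no longer carry the AVX/AVX-512 RUN lines. As a sketch only, the IR pattern being exercised is a 128-bit vector access whose alignment decides between the unaligned and aligned move instruction (function names here are illustrative, not from the patch):

  define <4 x i32> @load_v4i32_unaligned_sketch(<4 x i32>* %p) {
    %v = load <4 x i32>, <4 x i32>* %p, align 1    ; selects MOVUPSrm / VMOVUPSrm / VMOVUPSZ128rm(_NOVLX)
    ret <4 x i32> %v
  }

  define void @store_v4i32_aligned_sketch(<4 x i32> %v, <4 x i32>* %p) {
    store <4 x i32> %v, <4 x i32>* %p, align 16    ; selects MOVAPSmr / VMOVAPSmr / VMOVAPSZ128mr(_NOVLX)
    ret void
  }

The align-1 access must keep the unaligned form, while the align-16 access may use the aligned form; the per-target CHECK prefixes above (SSE, AVX, AVX512F, AVX512VL) pin down which concrete opcode each configuration selects.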
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll new file mode 100644 index 000000000000..262cb96ca6d8 --- /dev/null +++ b/test/CodeGen/X86/O0-pipeline.ll @@ -0,0 +1,67 @@ +; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s + +; REQUIRES: asserts + +; CHECK-LABEL: Pass Arguments: +; CHECK-NEXT: Target Library Information +; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Target Pass Configuration +; CHECK-NEXT: Type-Based Alias Analysis +; CHECK-NEXT: Scoped NoAlias Alias Analysis +; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Machine Module Information +; CHECK-NEXT: Machine Branch Probability Analysis +; CHECK-NEXT: ModulePass Manager +; CHECK-NEXT: Pre-ISel Intrinsic Lowering +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Expand Atomic instructions +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Lower Garbage Collection Instructions +; CHECK-NEXT: Shadow Stack GC Lowering +; CHECK-NEXT: Remove unreachable blocks from the CFG +; CHECK-NEXT: Inserts calls to mcount-like functions +; CHECK-NEXT: Scalarize Masked Memory Intrinsics +; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Rewrite Symbols +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: Safe Stack instrumentation pass +; CHECK-NEXT: Insert stack protectors +; CHECK-NEXT: Module Verifier +; CHECK-NEXT: X86 DAG->DAG Instruction Selection +; CHECK-NEXT: X86 PIC Global Base Reg Initialization +; CHECK-NEXT: Expand ISel Pseudo-instructions +; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: X86 WinAlloca Expander +; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Two-Address instruction pass +; CHECK-NEXT: Fast Register Allocator +; CHECK-NEXT: Bundle Machine CFG Edges +; CHECK-NEXT: X86 FP Stackifier +; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization +; CHECK-NEXT: Post-RA pseudo instruction expansion pass +; CHECK-NEXT: X86 pseudo instruction expansion pass +; CHECK-NEXT: Analyze Machine Code For Garbage Collection +; CHECK-NEXT: X86 vzeroupper inserter +; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: StackMap Liveness Analysis +; CHECK-NEXT: Live DEBUG_VALUE analysis +; CHECK-NEXT: Insert fentry calls +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Insert XRay ops +; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine Optimization Remark Emitter +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: X86 Assembly Printer +; CHECK-NEXT: Free MachineFunction + +define void @f() { + ret void +} diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll index 35f488ea448c..d0160a5b84df 100644 --- a/test/CodeGen/X86/all-ones-vector.ll +++ b/test/CodeGen/X86/all-ones-vector.ll @@ -157,8 +157,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X32-AVX1-LABEL: allones_v32i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; 
X32-AVX256-LABEL: allones_v32i8: @@ -174,8 +174,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X64-AVX1-LABEL: allones_v32i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v32i8: @@ -194,8 +194,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X32-AVX1-LABEL: allones_v16i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v16i16: @@ -211,8 +211,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X64-AVX1-LABEL: allones_v16i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v16i16: @@ -231,8 +231,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X32-AVX1-LABEL: allones_v8i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8i32: @@ -248,8 +248,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X64-AVX1-LABEL: allones_v8i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8i32: @@ -268,8 +268,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X32-AVX1-LABEL: allones_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4i64: @@ -285,8 +285,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X64-AVX1-LABEL: allones_v4i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4i64: @@ -305,8 +305,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X32-AVX1-LABEL: allones_v4f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64: @@ -322,8 +322,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X64-AVX1-LABEL: allones_v4f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64: @@ -342,8 +342,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v4f64_optsize: ; X32-AVX1: # 
BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64_optsize: @@ -359,8 +359,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v4f64_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64_optsize: @@ -379,8 +379,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X32-AVX1-LABEL: allones_v8f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32: @@ -396,8 +396,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X64-AVX1-LABEL: allones_v8f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32: @@ -416,8 +416,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v8f32_optsize: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32_optsize: @@ -433,8 +433,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v8f32_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32_optsize: @@ -455,8 +455,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X32-AVX1-LABEL: allones_v64i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -487,8 +487,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X64-AVX1-LABEL: allones_v64i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -522,8 +522,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X32-AVX1-LABEL: allones_v32i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -554,8 +554,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X64-AVX1-LABEL: allones_v32i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; 
X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -589,8 +589,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X32-AVX1-LABEL: allones_v16i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -615,8 +615,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X64-AVX1-LABEL: allones_v16i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -644,8 +644,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X32-AVX1-LABEL: allones_v8i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -670,8 +670,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X64-AVX1-LABEL: allones_v8i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -699,8 +699,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X32-AVX1-LABEL: allones_v8f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -725,8 +725,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X64-AVX1-LABEL: allones_v8f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -754,8 +754,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X32-AVX1-LABEL: allones_v16f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -780,8 +780,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X64-AVX1-LABEL: allones_v16f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 2aaf14001758..aa28ef5175ed 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: 
avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa 
{{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, 
%xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; 
SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: 
pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), 
%xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ define 
void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 
+; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; 
SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; 
SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; 
SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, 
%xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index e6cc95fcdb23..6869d088e7cd 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -34,8 +34,8 @@ define void @zero256() nounwind ssp { define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ float>* %ptr2vec615, align 32 define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones2: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll index 066719b3bfe8..231334ddcb85 100644 --- a/test/CodeGen/X86/avx-cvt-3.ll +++ b/test/CodeGen/X86/avx-cvt-3.ll @@ -48,16 +48,16 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) { define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_insert_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -72,16 +72,16 @@ define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) { define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: 
sitofp_shuffle_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_shuffle_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -95,8 +95,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86: # BB#0: ; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: movl $2, %eax @@ -111,8 +110,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X64: # BB#0: ; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: movl $2, %eax diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 1d925ff8e9bd..3cadbe2a8db3 100644 --- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -99,16 +99,16 @@ define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; X32-LABEL: test_mm256_andnot_pd: ; X32: # BB#0: -; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_andnot_pd: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq @@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -2269,19 +2269,19 @@ define <8 x 
float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3 ; X32: # BB#0: ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; 
X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 052cacfea4dc..bb05481e313d 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -2837,4 +2837,54 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ret <8 x float> %8 } +define void @test_zeroall() { +; SANDY-LABEL: test_zeroall: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroall: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroall # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroall: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroall # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroall: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroall() + ret void +} +declare void @llvm.x86.avx.vzeroall() nounwind + +define void @test_zeroupper() { +; SANDY-LABEL: test_zeroupper: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroupper: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroupper: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroupper # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroupper: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroupper() + ret void +} +declare void @llvm.x86.avx.vzeroupper() nounwind + !0 = !{i32 1} diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 341dd867e4ff..647b7a8f4dfc 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index 63b0281a7339..e29cf09718ad 100644 --- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 4890afec2164..c03623a2f035 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ 
b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 0e7a8d25c56f..56962ca2671d 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; 
CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 
%x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: 
vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index cc5e9e038e0b..f800d01064ba 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -274,11 +274,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> % ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -301,11 +301,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -477,11 +477,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; 
CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -496,11 +496,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -515,11 +515,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -534,11 +534,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -553,11 +553,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) @@ -572,11 +572,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %ecx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2si %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; 
CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) @@ -591,11 +591,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %ecx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) @@ -610,11 +610,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %ecx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2si %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) @@ -683,8 +683,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) @@ -3656,11 +3657,11 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) @@ -3684,10 +3685,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vaddpd %xmm4, 
%xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) @@ -3903,11 +3904,11 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z} -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4) @@ -3928,11 +3929,11 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4) @@ -4434,8 +4435,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, < ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) @@ -4454,8 +4455,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) @@ -4556,9 +4557,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, 
%zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) @@ -4579,9 +4580,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovapd %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4603,9 +4604,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4650,9 +4651,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, < ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovaps %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} +; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) @@ -4721,9 +4722,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovapd %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 
x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4821,12 +4822,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vmovapd %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4849,12 +4850,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -4909,12 +4910,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4937,12 +4938,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231ss {rz-sae}, 
%xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5069,12 +5070,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5097,12 +5098,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5125,12 +5126,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, 
<2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5153,12 +5154,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll index 4ef88ac495c3..96aefdb10584 100644 --- a/test/CodeGen/X86/avx512-mask-spills.ll +++ b/test/CodeGen/X86/avx512-mask-spills.ll @@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: Lcfi1: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: Lcfi2: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: Lcfi3: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtw %zmm1, 
%zmm0, %k1 +; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload -; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload -; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_64i1: ; CHECK: ## BB#0: -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: Lcfi4: -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill -; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload -; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload ; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq %cmp_res = icmp ugt <64 x i8> %a, %b diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll new file mode 100644 index 000000000000..47c6813fa8dc --- /dev/null +++ b/test/CodeGen/X86/avx512-scalar_mask.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) { +; CHECK-LABEL: test_var_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) { +; CHECK-LABEL: test_var_maskz: +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4) + ret < 4 x float> %res +} + +; FIXME: we should just return %xmm0 here. +define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const0_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4) + ret < 4 x float> %res +} + +; FIXME: we should zero the lower element of xmm0 and return it. 
+define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const0_maskz: +; CHECK: ## BB#0: +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4) + ret < 4 x float> %res +} + +; FIXME: we should just return %xmm0 here. +define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const2_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4) + ret < 4 x float> %res +} + +; FIXME: we should zero the lower element of xmm0 and return it. +define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const2_maskz: +; CHECK: ## BB#0: +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const_allone_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const_allone_maskz: +; CHECK: ## BB#0: +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const_3_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: test_const_3_maskz: +; CHECK: ## BB#0: +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4) + ret < 4 x float> %res +} diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll new file mode 100644 index 000000000000..1940864824ff --- /dev/null +++ b/test/CodeGen/X86/avx512-vselect.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX +; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL + +target triple = "x86_64-unknown-unknown" + +define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0 +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: 
retq +entry: + %m.trunc = trunc <8 x i64> %m to <8 x i1> + %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b + ret <8 x i64> %ret +} + +; This is a very contrived test case to trick the legalizer into splitting the +; v16i1 masks in the select during type legalization, and in so doing extend them +; into two v8i64 types. This lets us ensure that the lowering code can handle +; both formulations of vselect. All of this trickery is because we can't +; directly form an SDAG input to the lowering. +define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) { +; CHECK-SKX-LABEL: test2: +; CHECK-SKX: # BB#0: # %entry +; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6 +; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0 +; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1 +; CHECK-SKX-NEXT: korw %k1, %k0, %k0 +; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1 +; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0 +; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1} +; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test2: +; CHECK-KNL: # BB#0: # %entry +; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6 +; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0 +; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1 +; CHECK-KNL-NEXT: korw %k1, %k0, %k1 +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2 +; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1} +; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq +entry: + %gt.m = fcmp ogt <16 x float> %x, zeroinitializer + %lt.m = fcmp olt <16 x float> %y, zeroinitializer + %m.or = or <16 x i1> %gt.m, %lt.m + %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b + ret <16 x double> %ret +} diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 9b4e73a18fc2..faa055dfbbf3 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512: @@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> 
@llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1) @@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 3337f42eb142..13b850ccc3b6 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: @@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) @@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512: @@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: 
vpermw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 7df07b0413ed..571f345d4616 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) @@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) @@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) @@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) @@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) @@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) @@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1496,9 +1496,9 @@ define <16 x 
i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) @@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll index 8f528394f5bd..f8f47c87100a 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; 
CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index 37aea45e6107..96254f7c95b0 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index c5478dad4224..1377733739fe 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -40,8 +40,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index 000390404b54..97ac0fde10ec 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -414,8 +414,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) @@ -434,8 +434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 52a84deebf51..595b3e0ebb86 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1568,8 +1568,8 @@ define <2 x 
double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) @@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, < ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) @@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll index ad9ea93c2031..1bfdfd0e634d 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ 
-635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0] ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) @@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll index ca130bd2b676..b8531e25bfa1 100644 --- a/test/CodeGen/X86/avx512er-intrinsics.ll +++ b/test/CodeGen/X86/avx512er-intrinsics.ll @@ -118,78 +118,78 @@ define <4 x float> @test_rcp28_ss(<4 x float> %a0) { } declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone -define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) { +define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_ss_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ; ret <4 x float> %res } -define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) { +define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_ss_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; 
CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ; + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ; ret <4 x float> %res } -define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) { +define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_sd_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ; ret <2 x double> %res } -define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) { +define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_sd_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1] ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ; + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ; ret <2 x double> %res } declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone -define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) { +define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01] +; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] %mem = load double , double * %ptr, align 8 %mem_v = insertelement <2 x double> undef, double %mem, i32 0 - %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, 
<2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ; ret <2 x double> %res } -define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) { +define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] +; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01] +; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12] ; CHECK-NEXT: retq # encoding: [0xc3] %ptr1 = getelementptr double, double* %ptr, i32 18 %mem = load double , double * %ptr1, align 8 %mem_v = insertelement <2 x double> undef, double %mem, i32 0 - %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ; ret <2 x double> %res } diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll index 30ecc0d2e49e..9659dc6d455a 100644 --- a/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll 
b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll index 3ca686cef3bf..b2fe6eba88ab 100644 --- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; 
CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 4d906a4fd29a..c2d8df6476b3 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> 
%x1,i8 -1) @@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> 
@llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> 
@llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) @@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) @@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) @@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 
{%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, 
%xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) @@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) @@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 1f324d679564..684b0468cf51 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] 
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## 
encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] +; CHECK-NEXT: vaddpd %xmm4, 
%xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) @@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) @@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll index afeba4ef2d99..94e2ee7a0aa9 100644 --- a/test/CodeGen/X86/bmi.ll +++ b/test/CodeGen/X86/bmi.ll @@ -454,6 +454,30 @@ entry: ret i32 %and } +define i32 @bzhi32d(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shr = lshr i32 -1, %sub + %and = and i32 %shr, %a + ret i32 %and +} + +define i32 @bzhi32e(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shl = shl i32 %a, %sub + %shr = lshr i32 %shl, %sub + ret i32 %shr +} + define i64 @bzhi64b(i64 %x, i8 zeroext %index) { ; CHECK-LABEL: bzhi64b: ; CHECK: # BB#0: # %entry @@ -468,6 +492,58 @@ entry: ret i64 %and } +define i64 @bzhi64c(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64c: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shr = lshr i64 -1, %sub + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64d(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shr = lshr i64 -1, %sh_prom + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64e(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shl = shl i64 %a, %sub + %shr = lshr i64 %shl, %sub + ret i64 %shr +} + +define i64 @bzhi64f(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64f: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %shl, %sh_prom + ret i64 %shr +} + define i64 @bzhi64_constant_mask(i64 %x) { ; CHECK-LABEL: bzhi64_constant_mask: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll index a9c74df9d0d9..1340b7662a7a 100644 --- a/test/CodeGen/X86/bswap_tree2.ll +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, 
%eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index 1e44aec99fc5..83ab2fac2f16 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -200,32 +200,29 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; SSE41: # BB#0: ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm5 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: pshufb %xmm1, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pandn %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm5 +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc: ; AVX1: # BB#0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -233,13 +230,11 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cmp = icmp eq <8 x i16> %a, %b diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll index 887abe99f6ed..37beb438d737 100644 --- a/test/CodeGen/X86/combine-abs.ll +++ b/test/CodeGen/X86/combine-abs.ll @@ -50,12 +50,11 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) { define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) { ; AVX2-LABEL: combine_v4i64_abs_abs: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpaddq 
%ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3ad38f2717d9..3dbff2680c22 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -11,8 +11,7 @@ define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 706e89051a3d..21564cdd7353 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -6,30 +6,12 @@ define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_zero: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm0, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll index 7b82125dc372..2f3c343afac0 100644 --- a/test/CodeGen/X86/constructor.ll +++ b/test/CodeGen/X86/constructor.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-unknown-freebsd < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s +; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS +; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY @llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }] @v = weak_odr global i8 0 @@ -37,3 +39,6 @@ entry: ; NACL-NEXT: .section .init_array,"aw",@init_array ; NACL-NEXT: .p2align 2 ; NACL-NEXT: .long f + +; MCU-CTORS: .section .ctors,"aw",@progbits +; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array diff --git 
a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll index fb0da1b50d11..893ca93a9944 100644 --- a/test/CodeGen/X86/dbg-baseptr.ll +++ b/test/CodeGen/X86/dbg-baseptr.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s | FileCheck %s +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF ; This test checks that parameters on the stack pointer are correctly ; referenced by debug info. target triple = "x86_64--" @@ -7,24 +8,54 @@ target triple = "x86_64--" @ptr = external global i32* %struct.s = type { i32, i32, i32, i32, i32 } +; Simple case: no FP, use offset from RSP. + ; CHECK-LABEL: f0: -; CHECK: DEBUG_VALUE: f:input <- [%RSP+8] +; CHECK-NOT: pushq +; CHECK: movl $42, %eax +; CHECK: retq define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 { call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 - ret i32 42 + ret i32 42, !dbg !18 } +; DWARF-LABEL: .debug_info contents: + +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 57 ) +; 0x57 -> RSP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f0") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 08 ) +; DW_OP_fbreg (0x91) 0x08 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + + +; Dynamic alloca forces the use of RBP as the base pointer + ; CHECK-LABEL: f1: -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] +; CHECK: pushq %rbp +; CHECK: movl $42, %eax +; CHECK: popq %rbp +; CHECK: retq define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { %val = load i64, i64* @glob ; this alloca should force FP usage. %stackspace = alloca i32, i64 %val, align 1 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !20, metadata !17), !dbg !21 - ret i32 42 + ret i32 42, !dbg !21 } +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f1") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + ; CHECK-LABEL: f2: ; Just check that we are indeed aligning the stack and setting up a base pointer ; in RBX. @@ -34,17 +65,24 @@ define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { ; CHECK: andq $-64, %rsp ; CHECK: subq $64, %rsp ; CHECK: movq %rsp, %rbx -; The parameter should still be referenced through RBP though. -; CHECK-NOT: DEBUG_VALUE: f:input <- [%RBX -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] define i32 @f2(%struct.s* byval align 8 %input) !dbg !22 { %val = load i64, i64* @glob %stackspace = alloca i32, i64 %val, align 64 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !23, metadata !17), !dbg !24 - ret i32 42 + ret i32 42, !dbg !24 } +; "input" should still be referred to through RBP. 
+; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f2") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!2} @@ -52,7 +90,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !0 = !{i32 2, !"Dwarf Version", i32 4} !1 = !{i32 2, !"Debug Info Version", i32 3} -!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug) !3 = !DIFile(filename: "dbg-baseptr.ll", directory: "/") !4 = !DILocalVariable(name: "input", arg: 1, scope: !8, file: !3, line: 5, type: !9) !5 = !{} @@ -60,7 +98,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !6 = !DISubroutineType(types: !7) !7 = !{!10, !9} -!8 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!8 = distinct !DISubprogram(name: "f0", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, unit: !2, variables: !5) !9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", elements: !11) !10 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) @@ -74,9 +112,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !17 = !DIExpression() !18 = !DILocation(line: 5, scope: !8) -!19 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!19 = distinct !DISubprogram(name: "f1", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !20 = !DILocalVariable(name: "input", arg: 1, scope: !19, file: !3, line: 5, type: !9) !21 = !DILocation(line: 5, scope: !19) -!22 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!22 = distinct !DISubprogram(name: "f2", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !23 = !DILocalVariable(name: "input", arg: 1, scope: !22, file: !3, line: 5, type: !9) !24 = !DILocation(line: 5, scope: !22) diff --git a/test/CodeGen/X86/elf-associated.ll b/test/CodeGen/X86/elf-associated.ll index 361cf66cce72..7d58c3437025 100644 --- a/test/CodeGen/X86/elf-associated.ll +++ b/test/CodeGen/X86/elf-associated.ll @@ -37,3 +37,8 @@ @l = global i32 1, section "ccc", !associated !5 !5 = !{i32* null} ; CHECK-DAG: .section ccc,"aw",@progbits + +; Null metadata. 
+@m = global i32 1, section "ddd", !associated !6 +!6 = distinct !{null} +; CHECK-DAG: .section ddd,"aw",@progbits diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index d68236e9d250..eb06eb75a4d7 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 98082ec611d4..6c6bc8bdc1d1 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll index 4596b83f7bc2..b5507523a75a 100644 --- a/test/CodeGen/X86/haddsub-2.ll +++ b/test/CodeGen/X86/haddsub-2.ll @@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll index cea9ac26edbc..ec620b8ce877 100644 --- a/test/CodeGen/X86/known-signbits-vector.ll +++ b/test/CodeGen/X86/known-signbits-vector.ll @@ -137,3 +137,64 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin %6 = sitofp i64 %5 to float ret float %6 } + +define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_sext_shuffle_sitofp: +; X32: # BB#0: +; X32-NEXT: vpmovsxdq %xmm0, %xmm1 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovsxdq %xmm0, %xmm0 +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_sext_shuffle_sitofp: +; X64: # BB#0: +; X64-NEXT: vpmovsxdq %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X64-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + %3 = sitofp <4 x i64> %2 to <4 x double> + ret <4 x double> %3 +} + +define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X32: # BB#0: +; X32-NEXT: vpsrad $16, %xmm0, %xmm1 +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X64: # BB#0: +; X64-NEXT: vpsrad $16, %xmm0, %xmm1 +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X64-NEXT: retq + %1 = ashr <2 x i64> %a0, <i64 16, i64 16> + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %4 = ashr <4 x i64> %3, <i64 16, i64 16, i64 16, i64 16> + %5 = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + %6 = sitofp <2 x i64> %5 to <2 x double> + ret <2 x double> %6 +} diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir new file mode 100644 index 000000000000..70aac21c7ff2 --- /dev/null +++ b/test/CodeGen/X86/leaFixup32.mir @@ -0,0 +1,508 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll' + source_filename = "test/CodeGen/X86/fixup-lea.ll" + target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + target triple = "i386" + ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFinxup32.mir + + ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where ADD32ri8 is chosen + define i32 @test2add_32() { + ret i32 0 + } + + ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_ebp_32() { + ret i32 0 + } + + ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced + ; with an add instruction + define i32 @test1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_32() { + ret i32 0 + } + + ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base is ebp register + define i32 @testleaadd_ebp_32() { + ret i32 0 + } + + ;test1lea_ebp_32: 2 operands LEA32r wher base register is 
rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_ebp_32() { + ret i32 0 + } + + ;test2addi32_32: 3 operands LEA32r that can be replaced with 2 add instructions where ADD32ri32 + ; is chosen + define i32 @test2addi32_32() { + ret i32 0 + } + + ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_ebp_index_32() { + ret i32 0 + } + + ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_ebp_index2_32() { + ret i32 0 + } + + ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_32() { + ret i32 0 + } + + ;test_skip_eflags_32: LEA32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_32() { + ret i32 0 + } + +... +--- +name: test2add_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %eax + +... +--- +name: test2add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebp + +... +--- +name: test1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + + %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebp + +... 
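The CHECK lines above state the expected rewrite only as machine-instruction names. As a minimal C sketch of the arithmetic being preserved (assuming the usual base + scale*index + disp semantics of a three-operand LEA; the function names are invented for illustration and do not appear in the test):

  #include <stdint.h>

  /* dst = base + scale*index + disp, computed without touching EFLAGS */
  static uint32_t lea32(uint32_t base, uint32_t scale, uint32_t index, int32_t disp) {
      return base + scale * index + (uint32_t)disp;
  }

  /* %eax = LEA32r %eax, 1, %ebp, -5  ==>  ADD32rr + ADD32ri8, as checked above */
  static uint32_t two_adds(uint32_t eax, uint32_t ebp) {
      eax += ebp;           /* ADD32rr  %eax, %ebp */
      eax += (uint32_t)-5;  /* ADD32ri8 %eax, -5   */
      return eax;           /* equals lea32(original %eax, 1, %ebp, -5) */
  }

Unsigned 32-bit wraparound makes the two forms agree for any inputs, which is why the split is value-preserving when the destination aliases the base and the scale is 1.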
+--- +name: testleaadd_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %esi + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0 + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + + %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebx + +... +--- +name: test2addi32_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA32r killed %eax, 1, killed %ebp, 129, _ + RETQ %eax + +... 
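test2add_32 and test2addi32_32 above differ essentially only in the displacement: -5 is checked against ADD32ri8 and 129 against ADD32ri. A one-line sketch of the assumed selection rule (a sign-extended 8-bit immediate covers -128..127; fits_simm8 is a hypothetical helper, not part of the test):

  #include <stdint.h>

  static int fits_simm8(int32_t disp) {
      return disp >= -128 && disp <= 127;  /* -5 -> 1 (ADD32ri8), 129 -> 0 (ADD32ri) */
  }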
+--- +name: test1mov1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = MOV32rr killed %ebp + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index2_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: test_skip_opt_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + + %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + RETQ %ebp + +... 
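The base-equals-index cases above all reduce to small multiples of %ebp plus a displacement; a short C sketch of the values involved (names invented for illustration):

  #include <stdint.h>

  /* %ebx = LEA32r %ebp, 1, %ebp, 0  ==>  MOV32rr + ADD32rr              : 2*ebp     */
  /* %ebx = LEA32r %ebp, 4, %ebp, 5  ==>  LEA32r _, 4, %ebp, 5 + ADD32rr : 5*ebp + 5 */
  /* %ebp = LEA32r %ebp, 4, %ebp, 0  ==>  left as-is (test_skip_opt_32)              */
  static uint32_t base_eq_index(uint32_t ebp, uint32_t scale, int32_t disp) {
      return ebp + scale * ebp + (uint32_t)disp;
  }

In the last case the destination is %ebp as well, and per the comment in the test there is no profitable two-instruction replacement, so the LEA is expected to be left untouched.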
+--- +name: test_skip_eflags_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebp' } + - { reg: '%eax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP32rr %eax, killed %ebx, implicit-def %eflags + %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %eax, %ebp, %ebx + %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _ + RETQ %ebp + +... + + + diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir new file mode 100644 index 000000000000..9b0058750598 --- /dev/null +++ b/test/CodeGen/X86/leaFixup64.mir @@ -0,0 +1,1041 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'lea-2.ll' + source_filename = "lea-2.ll" + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFinxup64.mir + + ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; but can be replaced with 1 lea + 1 add + define i32 @testleaadd_64_32_1() { + ret i32 0 + } + + ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add + define i32 @testleaadd_rbp_64_32_1() { + ret i32 0 + } + + ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and can not + ; be replaced with an add instruction but can be replaced with 1 lea instruction + define i32 @test1lea_rbp_64_32_1() { + ret i32 0 + } + + ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions + define i32 @test2add_64() { + ret i32 0 + } + + ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_rbp_64() { + ret i32 0 + } + + ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced + ; with an add instruction + define i32 @test1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64_32() { + ret i32 0 + } + + ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64() { + ret i32 0 + } + + ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64() { + ret i32 0 + } + + ;test1lea_rbp_64: 2 operands LEA64r wher base register is rbp/r13/ebp and can be 
replaced + ; with a lea instruction + define i32 @test1lea_rbp_64() { + ret i32 0 + } + + ;test8: dst = base & scale!=1, can't optimize + define i32 @test8() { + ret i32 0 + } + + ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where + ; ADD64ri32 is chosen + define i32 @testleaaddi32_64_32() { + ret i32 0 + } + + ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64_32() { + ret i32 0 + } + + ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32 + ; is chosen + define i32 @test2addi32_64() { + ret i32 0 + } + + ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64() { + ret i32 0 + } + + ;test_skip_opt_64: 3 operands LEA64r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64() { + ret i32 0 + } + + ;test_skip_eflags_64: LEA64r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64() { + ret i32 0 + } + + ;test_skip_opt_64_32: 3 operands LEA64_32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64_32() { + ret i32 0 + } + + ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64_32() { + ret i32 0 + } + + +... +--- +name: testleaadd_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... 
+--- +name: testleaadd_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... +--- +name: test1lea_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: test2add_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri8 %rax, -5 + + %rax = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... +--- +name: test2add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + ; CHECK: %rbp = ADD64ri8 %rbp, -5 + + %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... 
+--- +name: test1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + + %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: testleaadd_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... 
+--- +name: testleaadd_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + + %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... +--- +name: test8 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rdi' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rdi, %rbp + ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _ + ; CHECK: %r12 = ADD64rr %r12, killed %rbp + %rbp = KILL %rbp, implicit-def %rbp + %r13 = KILL %rdi, implicit-def %r13 + %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _ + RETQ %r12 + +... 
+--- +name: testleaaddi32_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test2addi32_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri32 %rax, 129 + + %rax = LEA64r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = MOV64rr killed %rbp + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test_skip_opt_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + + %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %rbp = ADD64ri8 %rbp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... +--- +name: test_skip_opt_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + + %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... 
+ + + diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 000000000000..a9cf086dbd90 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f..af86df510016 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 29a662fb217e..c5de8dd96cbc 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32 -; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll index 71417694b0d4..2f7714e63886 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index e62a1d04dad6..94bbe75702cb 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll index dfce6c681500..83b2be83d552 100644 --- a/test/CodeGen/X86/not-and-simplify.ll +++ b/test/CodeGen/X86/not-and-simplify.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefix=ALL --check-prefix=NO_BMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=ALL --check-prefix=BMI @@ -11,13 +12,24 @@ define i32 @shrink_xor_constant1(i32 %x) { ; ALL-NEXT: xorl $1, %edi ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = lshr i32 %x, 31 %not = xor i32 %sh, -1 %and = and i32 %not, 1 ret i32 %and } +define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) { +; ALL-LABEL: shrink_xor_constant1_splat: +; ALL: # BB#0: +; ALL-NEXT: psrld $31, %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> + %not = xor <4 x i32> %sh, <i32 -1, i32 -1, i32 -1, i32 -1> + %and = and <4 x i32> %not, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %and +} + ; Clear low bits via shift, set them with xor (not), then mask them off. 
define i8 @shrink_xor_constant2(i8 %x) { @@ -27,10 +39,22 @@ define i8 @shrink_xor_constant2(i8 %x) { ; ALL-NEXT: xorb $-32, %dil ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = shl i8 %x, 5 %not = xor i8 %sh, -1 %and = and i8 %not, 224 ; 0xE0 ret i8 %and } +define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) { +; ALL-LABEL: shrink_xor_constant2_splat: +; ALL: # BB#0: +; ALL-NEXT: psllw $5, %xmm0 +; ALL-NEXT: pand {{.*}}(%rip), %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = shl <16 x i8> %x, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5> + %not = xor <16 x i8> %sh, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %and = and <16 x i8> %not, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %and +} + diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index d26cf02dd942..0bda41a30c69 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll index 5cd649bb3902..24db6ba9ca2f 100644 --- a/test/CodeGen/X86/packss.ll +++ b/test/CodeGen/X86/packss.ll @@ -26,18 +26,17 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; X64-AVX1-LABEL: trunc_ashr_v4i64: ; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: trunc_ashr_v4i64: ; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; X64-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 88cb7a6d5825..50a661fcca11 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; 
SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = 
ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/pr28129.ll b/test/CodeGen/X86/pr28129.ll index a155f71f79c3..15bffffa207f 100644 --- a/test/CodeGen/X86/pr28129.ll +++ b/test/CodeGen/X86/pr28129.ll @@ -5,15 +5,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { ; X86-LABEL: cmp4f64_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -26,15 +26,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { ; X86-LABEL: cmp4f64_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -47,15 +47,15 @@ define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { define 
<8 x float> @cmp8f32_domain(<8 x float> %a) { ; X86-LABEL: cmp8f32_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp8f32_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer @@ -68,15 +68,15 @@ define <8 x float> @cmp8f32_domain(<8 x float> %a) { define <8 x float> @cmp8f32_domain_optsize(<8 x float> %a) optsize { ; X86-LABEL: cmp8f32_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp8f32_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll index 8c970b3d4771..94904018872b 100644 --- a/test/CodeGen/X86/pr29112.ll +++ b/test/CodeGen/X86/pr29112.ll @@ -38,7 +38,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -52,10 +53,9 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vmovaps %xmm15, %xmm1 ; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9 -; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) diff --git a/test/CodeGen/X86/pr30562.ll b/test/CodeGen/X86/pr30562.ll index dda736a1a183..a8e648074194 100644 --- a/test/CodeGen/X86/pr30562.ll +++ b/test/CodeGen/X86/pr30562.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + define i32 @foo(i64* nocapture %perm, i32 %n) { entry: br label %body diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll index 0dd8eb0ece85..d7a546c7396d 100644 --- a/test/CodeGen/X86/pr31088.ll +++ b/test/CodeGen/X86/pr31088.ll @@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; F16C-NEXT: vcvtps2ph $4, %xmm1, 
%xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index e05fc926b080..143e3af82eb7 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -30,25 +30,24 @@ define void @foo() { ; X86-O0-NEXT: subl $12, %esp ; X86-O0-NEXT: .Lcfi0: ; X86-O0-NEXT: .cfi_def_cfa_offset 16 -; X86-O0-NEXT: movzbl c, %eax -; X86-O0-NEXT: testl %eax, %eax -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: movl %eax, %edx -; X86-O0-NEXT: movb %dl, %ch -; X86-O0-NEXT: testb %ch, %ch +; X86-O0-NEXT: movb c, %al +; X86-O0-NEXT: testb %al, %al ; X86-O0-NEXT: setne {{[0-9]+}}(%esp) -; X86-O0-NEXT: movzbl %cl, %edx -; X86-O0-NEXT: subl %eax, %edx -; X86-O0-NEXT: setle %cl -; X86-O0-NEXT: # implicit-def: %EAX -; X86-O0-NEXT: movb %cl, %al -; X86-O0-NEXT: andl $1, %eax -; X86-O0-NEXT: kmovd %eax, %k0 -; X86-O0-NEXT: kmovd %k0, %eax +; X86-O0-NEXT: movzbl c, %ecx +; X86-O0-NEXT: testl %ecx, %ecx +; X86-O0-NEXT: setne %al +; X86-O0-NEXT: movzbl %al, %edx +; X86-O0-NEXT: subl %ecx, %edx +; X86-O0-NEXT: setle %al +; X86-O0-NEXT: # implicit-def: %ECX ; X86-O0-NEXT: movb %al, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %eax -; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-O0-NEXT: andl $1, %ecx +; X86-O0-NEXT: kmovd %ecx, %k0 +; X86-O0-NEXT: kmovd %k0, %ecx +; X86-O0-NEXT: movb %cl, %al +; X86-O0-NEXT: andb $1, %al +; X86-O0-NEXT: movzbl %al, %ecx +; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-O0-NEXT: addl $12, %esp ; X86-O0-NEXT: retl @@ -69,27 +68,25 @@ define void @foo() { ; ; X64-O0-LABEL: foo: ; X64-O0: # BB#0: # %entry -; X64-O0-NEXT: movzbl {{.*}}(%rip), %eax -; X64-O0-NEXT: movl %eax, %ecx -; X64-O0-NEXT: movb %cl, %dl -; X64-O0-NEXT: movl %ecx, %eax -; X64-O0-NEXT: testq %rcx, %rcx -; X64-O0-NEXT: setne %sil -; X64-O0-NEXT: testb %dl, %dl +; X64-O0-NEXT: movb {{.*}}(%rip), %al +; X64-O0-NEXT: testb %al, %al ; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movzbl %sil, %edi -; X64-O0-NEXT: subl %eax, %edi -; X64-O0-NEXT: setle %dl -; X64-O0-NEXT: # implicit-def: %EAX -; X64-O0-NEXT: movb %dl, %al -; X64-O0-NEXT: andl $1, %eax -; X64-O0-NEXT: kmovd %eax, %k0 -; X64-O0-NEXT: kmovd %k0, %eax -; X64-O0-NEXT: movb %al, %dl -; X64-O0-NEXT: andb $1, %dl -; X64-O0-NEXT: movzbl %dl, %eax -; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx +; X64-O0-NEXT: testl %ecx, %ecx +; X64-O0-NEXT: setne %al +; X64-O0-NEXT: movzbl %al, %edx +; X64-O0-NEXT: subl %ecx, %edx +; X64-O0-NEXT: setle %al +; X64-O0-NEXT: # implicit-def: %ECX +; X64-O0-NEXT: movb %al, %cl +; X64-O0-NEXT: andl $1, %ecx +; X64-O0-NEXT: kmovd %ecx, %k0 +; X64-O0-NEXT: kmovd %k0, %ecx +; X64-O0-NEXT: movb %cl, %al +; X64-O0-NEXT: andb $1, %al +; X64-O0-NEXT: movzbl %al, %ecx +; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; X64-O0-NEXT: retq entry: %a = alloca i8, align 1 diff --git a/test/CodeGen/X86/pr32907.ll b/test/CodeGen/X86/pr32907.ll index bc03fbe06843..8057b31c961c 100644 --- a/test/CodeGen/X86/pr32907.ll +++ 
b/test/CodeGen/X86/pr32907.ll @@ -5,41 +5,44 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) { -; SSE-LABEL: PR32907: -; SSE: # BB#0: # %entry -; SSE-NEXT: psubq %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psubq %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: PR32907: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubq %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: PR32907: +; SSE42: # BB#0: # %entry +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX2-LABEL: PR32907: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32907: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: diff --git a/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll new file mode 100644 index 000000000000..9a5da33223ba --- /dev/null +++ b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s -o /dev/null +; pr33001 - Check that llc doesn't crash when running with O0 option. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x i32> @test_masked_load(<4 x i32>* %base, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %base, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) + + +define void @test_masked_store(<4 x i32>* %base, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %base, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) + + +define <4 x i32> @llvm_masked_gather(<4 x i32*> %ptrs, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + + +define void @llvm_masked_scatter(<4 x i32*> %ptrs, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.scatter.v4i32(<4 x i32> %value, <4 x i32*> %ptrs, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) + diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 5d5150ad62d6..4be3a4c2391b 100644 --- a/test/CodeGen/X86/rotate.ll +++ b/test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index b8a8b8afd14f..6a565a5c76f0 100644 
--- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm3, %xmm1 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm7 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: psubd %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: psubd %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] 
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: psubd %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: psubd %xmm4, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: psubd %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE2-NEXT: psubd %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; 
SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm3 -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -398,288 +402,284 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $184, %rsp -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, 
%xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm4 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm8 -; SSE2-NEXT: movdqa %xmm13, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: psubd %xmm13, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: psubd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps a+1040(%rax), %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 +; 
SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; 
SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] ; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; 
SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd 
%xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, 
%xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm10, 
%ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm12, %ymm8 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8..1afef86a5f11 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 2996edaec3e0..332bf2887fb0 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll index c869dff9e642..6701c247e6fc 100644 --- a/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll index d46082f20a45..cbd5c69b1772 100644 --- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -5,9 +5,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { ; AVX2-LABEL: foo2: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,1] -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vmovapd %xmm1, (%rdi) +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX2-NEXT: vmovapd %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> %res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> <i32 1, i32 undef> @@ -18,9 +17,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind { ; AVX2-LABEL: foo4: ; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3] -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef> @@ -32,10 +30,8 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { ; AVX2-LABEL: foo8: ; AVX2: # BB#0: ; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7> -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7> @@ -46,7 +42,7 @@ define <8 x float> @foo8(<8 x float> %v, 
<8 x float> *%p) nounwind { define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> @@ -66,7 +62,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3> @@ -76,9 +72,10 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> @@ -89,9 +86,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask5: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3> diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 0b03dffe99b5..d99cfaf535de 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: retl ; @@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind { define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X32-LABEL: test_mm_setr_ps: ; X32: # BB#0: +; 
X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index dfc1aefd31a6..68ab3f9f3205 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: jne .LBB1_8 ; X32-NEXT: .LBB1_7: ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: jmp .LBB1_9 +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: je .LBB1_10 +; X32-NEXT: jmp .LBB1_11 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: .LBB1_9: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: jne .LBB1_11 -; X32-NEXT: # BB#10: +; X32-NEXT: .LBB1_10: ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_11: # %entry ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: jmp .LBB1_9 +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB1_10 +; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: testl %edx, %edx @@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: .LBB1_9: # %entry -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 -; X64-NEXT: # BB#10: +; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll index 4d895ea264c5..aed5e0d1c32e 100644 --- a/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: addss %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: addss %xmm0, 
%xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -431,12 +431,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 503b9416c8d3..4a0dc9c1eb17 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll index b83a8d61f6a2..df5ed5431b8a 100644 --- a/test/CodeGen/X86/stackmap-frame-setup.ll +++ 
b/test/CodeGen/X86/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def ret void diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index a42b3c96c3ae..1eef67764ab9 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: ; AVX1-NEXT: movq %rax, %rcx @@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: .LBB80_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax @@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX1-NEXT: jmp 
.LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx @@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB80_21: +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: ; AVX2-NEXT: movq %rax, %rcx @@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX2-NEXT: .LBB80_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax @@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: 
vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovq %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx @@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: .LBB80_21: +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll index 443264cdffd4..51c8b2111107 100644 --- a/test/CodeGen/X86/vec_set-2.ll +++ b/test/CodeGen/X86/vec_set-2.ll @@ -1,11 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test1(float %a) nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2 @@ -14,10 +22,15 @@ define <4 x float> @test1(float %a) nounwind { } define <2 x i64> @test(i32 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 %tmp6 = 
insertelement <4 x i32> %tmp, i32 0, i32 1 %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index ee4a08599968..b34f30924a8d 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a) { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 @@ -13,11 +19,17 @@ define <4 x float> @test(float %a) { } define <2 x i64> @test2(i32 %a) { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X64-NEXT: retq %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> @@ -25,10 +37,15 @@ define <2 x i64> @test2(i32 %a) { } define <4 x float> @test3(<4 x float> %A) { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp0 = extractelement <4 x float> %A, i32 0 %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1 %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2 diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll index 8f35529d61b4..09142e16aa6e 100644 --- a/test/CodeGen/X86/vec_set-4.ll +++ b/test/CodeGen/X86/vec_set-4.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i16 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: pxor %xmm0, %xmm0 
+; X64-NEXT: pinsrw $3, %edi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3 %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4 %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5 @@ -17,12 +24,19 @@ define <2 x i64> @test(i16 %a) nounwind { } define <2 x i64> @test2(i8 %a) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $5, %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $5, %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pinsrw $5, %eax, %xmm0 +; X64-NEXT: retq %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10 %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11 %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12 diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll index 4429834b8ef0..3c9aca3a02da 100644 --- a/test/CodeGen/X86/vec_set-6.ll +++ b/test/CodeGen/X86/vec_set-6.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a, float %b, float %c) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll index e8fe6debb140..757a0d44cd43 100644 --- a/test/CodeGen/X86/vec_set-7.ll +++ b/test/CodeGen/X86/vec_set-7.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(<2 x i64>* %p) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: retq %tmp = bitcast <2 x i64>* %p to double* %tmp.upgrd.1 = load double, double* 
%tmp %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0 diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll index 7a4326c01bb7..a9dceb90855a 100644 --- a/test/CodeGen/X86/vec_set-8.ll +++ b/test/CodeGen/X86/vec_set-8.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i64 %i) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: retq +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 ret <2 x i64> %tmp11 diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll index cae39a3d775b..259ace98d362 100644 --- a/test/CodeGen/X86/vec_set-A.ll +++ b/test/CodeGen/X86/vec_set-A.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test1() nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movl $1, %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movl $1, %eax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: retq ret <2 x i64> < i64 1, i64 0 > } diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll index 0580a3376656..ecd9b57cfd0c 100644 --- a/test/CodeGen/X86/vec_set-B.ll +++ b/test/CodeGen/X86/vec_set-B.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 ; These should both generate something like this: ;_test3: @@ -9,26 +10,37 @@ ; ret define <2 x i64> @test3(i64 %arg) nounwind { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: movl $1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0 ret <2 x i64> %B } define <2 x i64> @test2(i64 %arg) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movl 
$1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> undef, i64 %A, i32 0 ret <2 x i64> %B } - diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll index 994bc2b3056e..865e2fb83f17 100644 --- a/test/CodeGen/X86/vec_set-C.ll +++ b/test/CodeGen/X86/vec_set-C.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64 define <2 x i64> @t1(i64 %x) nounwind { -; X32-LABEL: t1: -; X32: # BB#0: -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: retl +; X86-LABEL: t1: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # BB#0: diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll index 49bd3beef75a..6439a6dcb00b 100644 --- a/test/CodeGen/X86/vec_set.ll +++ b/test/CodeGen/X86/vec_set.ll @@ -1,27 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: movdqa %xmm3, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd 
{{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: movdqa %xmm3, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %r8d, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movd %r9d, %xmm2 +; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-NEXT: movdqa %xmm3, (%rdi) +; X64-NEXT: retq %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1 %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2 diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 226c0adbaf3c..2fb821555dba 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index a05a981daa1f..f0a5fe1dbfff 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por 
%xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll index f1f795bf3cb0..e3261d15538f 100644 --- a/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1,15 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 
--check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -194,16 +196,46 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: @@ -429,16 +461,46 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; 
AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64u: @@ -651,16 +713,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32: @@ -867,16 +954,41 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; 
AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32u: @@ -1054,8 +1166,28 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1063,7 +1195,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1238,8 +1370,28 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, 
%xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1247,7 +1399,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1399,8 +1551,23 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1546,8 +1713,23 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; 
AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1582,17 +1764,17 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: @@ -1610,17 +1792,17 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64u: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64u: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: @@ -1637,15 +1819,15 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: @@ -1661,15 +1843,15 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: @@ -1685,15 +1867,15 @@ define <8 x i16> @foldv8i16() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} 
xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: @@ -1709,15 +1891,15 @@ define <8 x i16> @foldv8i16u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: @@ -1733,15 +1915,15 @@ define <16 x i8> @foldv16i8() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: @@ -1757,15 +1939,15 @@ define <16 x i8> @foldv16i8u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll index 53cb4d8e445b..185e1f4865ea 100644 --- a/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -93,16 +95,76 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; 
AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64: @@ -225,16 +287,76 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; 
AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64u: @@ -342,16 +464,66 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, 
%ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32: @@ -454,16 +626,66 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, 
%ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32u: @@ -551,8 +773,48 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -638,8 +900,48 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -710,8 +1012,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; 
AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -784,8 +1116,38 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -818,15 +1180,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX-LABEL: foldv4i64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64: +; X64: # BB#0: +; X64-NEXT: 
vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # BB#0: @@ -837,15 +1194,10 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX-LABEL: foldv4i64u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # BB#0: @@ -856,15 +1208,10 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX-LABEL: foldv8i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # BB#0: @@ -875,15 +1222,10 @@ define <8 x i32> @foldv8i32() nounwind { } define <8 x i32> @foldv8i32u() nounwind { -; AVX-LABEL: foldv8i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # BB#0: @@ -894,15 +1236,15 @@ define <8 x i32> @foldv8i32u() nounwind { } define <16 x i16> @foldv16i16() nounwind { -; AVX-LABEL: foldv16i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # BB#0: @@ -913,15 +1255,15 @@ define <16 x i16> @foldv16i16() nounwind { } define <16 x i16> @foldv16i16u() nounwind { -; AVX-LABEL: foldv16i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # BB#0: @@ -932,15 +1274,15 @@ define <16 x i16> @foldv16i16u() nounwind { } define <32 x i8> @foldv32i8() nounwind { -; AVX-LABEL: foldv32i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8: 
-; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # BB#0: @@ -951,15 +1293,15 @@ define <32 x i8> @foldv32i8() nounwind { } define <32 x i8> @foldv32i8u() nounwind { -; AVX-LABEL: foldv32i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # BB#0: diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll new file mode 100644 index 000000000000..f737ea2b7fba --- /dev/null +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ + +; AVX1 has support for 256-bit bitwise logic because the FP variants were included. +; If using those ops requires extra insert/extract though, it's probably not worth it. 
+ +define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE-LABEL: PR32790: +; SSE: # BB#0: +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm6, %xmm0 +; SSE-NEXT: psubd %xmm7, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: PR32790: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR32790: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR32790: +; AVX512: # BB#0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: retq + %add = add <8 x i32> %a, %b + %and = and <8 x i32> %add, %c + %sub = sub <8 x i32> %and, %d + ret <8 x i32> %sub +} + +; In a more extreme case, even the later AVX targets should avoid extract/insert just +; because 256-bit ops are supported. + +define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { +; SSE-LABEL: do_not_use_256bit_op: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: do_not_use_256bit_op: +; AVX1: # BB#0: +; AVX1-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> +; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: do_not_use_256bit_op: +; AVX2: # BB#0: +; AVX2-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> +; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: do_not_use_256bit_op: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %and = and <8 x i32> %concat1, %concat2 + %extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, 
<4 x i32> <i32 4, i32 5, i32 6, i32 7> + %sub = sub <4 x i32> %extract1, %extract2 + ret <4 x i32> %sub +} + diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll index f05588a2920c..99a05c3d49c0 100644 --- a/test/CodeGen/X86/vector-pcmp.ll +++ b/test/CodeGen/X86/vector-pcmp.ll @@ -148,8 +148,8 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) { ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -177,8 +177,8 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -206,8 +206,8 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -242,14 +242,13 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) { ; ; AVX1-LABEL: test_pcmpgtq_256: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index af3ddcf8048e..09e143ddcd4d 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -81,6 +85,41 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -147,6 +186,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; 
X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -253,6 +327,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -436,6 +559,89 @@ define <32 x i8> 
@var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: 
var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -499,6 +705,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -546,6 +779,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -593,6 +841,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -776,6 +1039,84 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; 
X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -843,6 +1184,43 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 ; 
AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -893,6 +1271,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -980,6 +1381,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -1149,6 +1584,81 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; 
X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; 
X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -1206,6 +1716,25 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -1246,6 +1775,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -1286,6 +1828,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -1352,6 +1907,31 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 
+; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 60575250d713..46be36b76e98 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -59,6 +63,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -125,6 +149,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrld %xmm5, %xmm2, 
%xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -231,6 +290,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -357,6 +465,56 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -401,6 +559,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: 
splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = lshr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -448,6 +623,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -495,6 +685,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -625,6 +830,55 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -677,6 +931,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -727,6 +1002,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $7, %xmm0, 
%xmm2 +; X32-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -814,6 +1112,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -930,6 +1262,52 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -974,6 +1352,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -1014,6 +1405,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, <i32 5, i32 5, 
i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -1054,6 +1458,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -1103,6 +1520,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 7f534050b6a7..4a134f440a78 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -56,6 +60,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } @@ -105,6 +129,27 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; X32-AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift } @@ -205,6 +250,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsllw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: 
vpsllw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -319,6 +413,52 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; 
X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -363,6 +503,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = shl <4 x i64> %a, %splat ret <4 x i64> %shift @@ -410,6 +567,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = shl <8 x i32> %a, %splat ret <8 x i32> %shift @@ -457,6 +629,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift @@ -577,6 +764,51 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; 
X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift @@ -626,6 +858,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -666,6 +919,19 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmulld 
{{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -719,6 +985,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -827,6 +1106,48 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, 
i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -871,6 +1192,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -911,6 +1245,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpslld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -951,6 +1298,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -999,6 +1359,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll index 
26cd7301fe60..7a5c992bb829 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -1,129 +1,235 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefixes=ALL,KNL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefixes=ALL,SKX %s target triple = "x86_64-unknown-unknown" -define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +define <32 x i16> @shuffle_v32i16(<32 x i16> %a) { +; KNL-LABEL: shuffle_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: -; ALL: # BB#0: -; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; SKX: ## BB#0: +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> -; ALL-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; KNL: ## BB#0: +; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] +; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] +; KNL-NEXT: vmovdqa {{.*#+}} 
ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> +; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> +; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0> +; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1, i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 31> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] -; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7] +; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15] +; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 +; KNL-NEXT: vpbroadcastw %xmm3, %ymm3 +; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: 
shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] +; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x 
i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: -; ALL: # BB#0: -; ALL-NEXT: vpsrld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; KNL: ## BB#0: +; KNL-NEXT: vpsrld $16, %ymm0, %ymm0 +; KNL-NEXT: vpsrld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; SKX: ## BB#0: +; SKX-NEXT: vpsrld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: -; ALL: # BB#0: -; ALL-NEXT: vpslld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $16, %ymm0, %ymm0 +; KNL-NEXT: vpslld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, 
i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { -; ALL-LABEL: 
shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; ALL: # BB#0: -; ALL-NEXT: movl $1, %eax -; ALL-NEXT: kmovd %eax, %k1 -; ALL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; KNL: ## BB#0: +; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF +; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; SKX: ## BB#0: +; SKX-NEXT: movl $1, %eax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32> ret <32 x i16> %shuffle } define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -132,11 +238,19 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { } define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_sext_i16: -; ALL: # BB#0: -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: +; KNL: ## BB#0: +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: +; SKX: ## BB#0: +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i16, i16* %ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -146,11 +260,17 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { } define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt1_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -159,11 +279,17 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt3_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: 
insert_dup_elt3_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -172,19 +298,79 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: -; ALL: # BB#0: -; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0> ret <32 x i16> %shuffle } define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: -; ALL: # BB#0: -; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwd 
{{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0> ret <32 x i16> %shuffle } + +define <8 x i16> @pr32967(<32 x i16> %v) { +; KNL-LABEL: pr32967: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: pr32967: +; SKX: ## BB#0: +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpextrw $1, %xmm0, %ecx +; SKX-NEXT: vmovd %ecx, %xmm1 +; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm2, %eax +; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; SKX-NEXT: vpextrw $1, %xmm0, %eax +; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29> + ret <8 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..8081e9482d67 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vsqrtss 12(%rdi), %xmm2, %xmm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: retq entry: %0 = load float, float* %v, align 4 diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll index 34a9df1782a4..f5ec8e540b0b 100644 --- a/test/CodeGen/X86/viabs.ll +++ 
b/test/CodeGen/X86/viabs.ll
@@ -405,16 +405,16 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
;
; AVX1-LABEL: test_abs_ge_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_ge_v2i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -447,21 +447,20 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_abs_gt_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -504,35 +503,31 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; AVX1-LABEL: test_abs_le_v8i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -581,37 +576,33 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64_fold:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
index d33fda4f49c2..7807991b455d 100644
--- a/test/CodeGen/X86/vselect-pcmp.ll
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -35,9 +35,7 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask)
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%tr = icmp slt <8 x i16> %mask, zeroinitializer
%z = select <8 x i1> %tr, <8 x i16> %x, <8 x i16> %y
@@ -162,18 +160,14 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX512-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%tr = icmp slt <16 x i16> %mask, zeroinitializer
%z = select <16 x i1> %tr, <16 x i16> %x, <16 x i16> %y
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..450e255313b3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 7e370c25e31b..3052a0f615eb 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py for function "bar"
; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
;; be preserved except for registers used for passing/returning arguments.
;; In the following function registers %RDI, %RSI and %XMM0 are used to store
;; arguments %a0, %a1 and %b0 accordingally. The value is returned in %RAX.
@@ -28,20 +28,20 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
ret i32 4
}
-;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
-;; doesn't need to preserve registers except for the arguments passed
+;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
+;; doesn't need to preserve registers except for the arguments passed
;; to "bar" (%ESI, %EDI and %XMM0).
define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
-; CHECK-LABEL: foo
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: callq bar
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
-; CHECK-NEXT: addss %xmm0, %xmm1
+; CHECK-LABEL: foo +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: callq bar +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ssl %eax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm1 ; CHECK: retq %call = call i32 @bar(i32 %a0, i32 %a1, float %b0) #0 %c0 = add i32 %a0, %call diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/test/CodeGen/X86/x86-no_caller_saved_registers.ll index 9c62e3ee6ba7..4e5403d1847f 100644 --- a/test/CodeGen/X86/x86-no_caller_saved_registers.ll +++ b/test/CodeGen/X86/x86-no_caller_saved_registers.ll @@ -1,31 +1,31 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
-;; be preserved except for registers used for passing/returning arguments.
-;; The test checks that function "bar" preserves xmm0 register.
-;; It also checks that caller function "foo" does not store registers for callee
-;; "bar". For example, there is no store/load/access to xmm registers.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
-; CHECK-LABEL: bar
-; CHECK: mov{{.*}} %xmm0
-; CHECK: mov{{.*}} {{.*}}, %xmm0
-; CHECK: ret
- call void asm sideeffect "", "~{xmm0}"()
- ret i32 1
-}
-
-define x86_intrcc void @foo(i8* nocapture readnone %c) {
-; CHECK-LABEL: foo
-; CHECK-NOT: xmm
-entry:
- tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
- ret void
-}
-
-attributes #0 = { "no_caller_saved_registers" }
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s +; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s +; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; In functions with 'no_caller_saved_registers' attribute, all registers should +;; be preserved except for registers used for passing/returning arguments. +;; The test checks that function "bar" preserves xmm0 register. +;; It also checks that caller function "foo" does not store registers for callee +;; "bar". For example, there is no store/load/access to xmm registers. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 { +; CHECK-LABEL: bar +; CHECK: mov{{.*}} %xmm0 +; CHECK: mov{{.*}} {{.*}}, %xmm0 +; CHECK: ret + call void asm sideeffect "", "~{xmm0}"() + ret i32 1 +} + +define x86_intrcc void @foo(i8* nocapture readnone %c) { +; CHECK-LABEL: foo +; CHECK-NOT: xmm +entry: + tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0 + ret void +} + +attributes #0 = { "no_caller_saved_registers" } diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll index 5b6e773fe5d4..519f0d0924e3 100644 --- a/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -270,8 +270,6 @@ if.end: ; preds = %if.else, %for.end ret i32 %sum.1 } -declare void @somethingElse(...) - ; Check with a more complex case that we do not have restore within the loop and ; save outside. ; CHECK-LABEL: loopInfoRestoreOutsideLoop: @@ -982,3 +980,54 @@ for.inc: } attributes #4 = { "no-frame-pointer-elim"="true" } + +@x = external global i32, align 4 +@y = external global i32, align 4 + +; The post-dominator tree does not include the branch containing the infinite +; loop, which can occur into a misplacement of the restore block, if we're +; looking for the nearest common post-dominator of an "unreachable" block. + +; CHECK-LABEL: infiniteLoopNoSuccessor: +; CHECK: ## BB#0: +; Make sure the prologue happens in the entry block. +; CHECK-NEXT: pushq %rbp +; ... +; Make sure we don't shrink-wrap. +; CHECK: ## BB#1 +; CHECK-NOT: pushq %rbp +; ... +; Make sure the epilogue happens in the exit block. +; CHECK: ## BB#5 +; CHECK: popq %rbp +; CHECK-NEXT: retq +define void @infiniteLoopNoSuccessor() #5 { + %1 = load i32, i32* @x, align 4 + %2 = icmp ne i32 %1, 0 + br i1 %2, label %3, label %4 + +; <label>:3: + store i32 0, i32* @x, align 4 + br label %4 + +; <label>:4: + call void (...) @somethingElse() + %5 = load i32, i32* @y, align 4 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %10, label %7 + +; <label>:7: + %8 = call i32 (...) @something() + br label %9 + +; <label>:9: + call void (...) @somethingElse() + br label %9 + +; <label>:10: + ret void +} + +declare void @somethingElse(...) 
+
+attributes #5 = { nounwind "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index a100a1425dd1..5f56e2d80d73 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -499,8 +499,8 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_cmov_si256:
; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; X32-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X32-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X32-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X32-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X32-NEXT: vandps %ymm2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm3, %ymm1, %ymm1
@@ -509,8 +509,8 @@ define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>
;
; X64-LABEL: test_mm256_cmov_si256:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; X64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X64-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X64-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X64-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X64-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-NEXT: vandps %ymm3, %ymm1, %ymm1