| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2017-01-09 21:23:09 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-01-09 21:23:09 +0000 |
| commit | 909545a822eef491158f831688066f0ec2866938 (patch) | |
| tree | 5b0bf0e81294007a9b462b21031b3df272c655c3 /test/CodeGen | |
| parent | 7e7b6700743285c0af506ac6299ddf82ebd434b9 (diff) | |
Diffstat (limited to 'test/CodeGen'): 43 files changed, 7124 insertions(+), 2721 deletions(-)
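A large part of the churn below reworks FileCheck prefixes in the r600/AMDGPU tests: in load-global-i16.ll, checks shared between the redwood (Evergreen) and cayman run lines move to a combined EGCM prefix, while EG and CM keep the target-specific store patterns (STORE_RAW vs. STORE_DWORD), and min.ll switches its SI checks to a shared GCN prefix. As a minimal sketch of how such shared prefixes combine on two run lines — the kernel body and the exact check lines here are illustrative, not taken from the patch:

```llvm
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,EGCM %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM,EGCM %s

; EG and CM lines are only verified for the run that names them in its
; -check-prefixes list; EGCM lines are verified for both runs.
; EG: MEM_RAT_CACHELESS STORE_RAW
; CM: MEM_RAT_CACHELESS STORE_DWORD
; EGCM: VTX_READ_32
define void @load_store_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  %val = load i32, i32 addrspace(1)* %in
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
```

The same pattern drives most of the hunks that follow: check lines that only hold for one store path stay on EG or CM, and everything else is renamed to EGCM (or, for the GCN targets, from SI to GCN).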
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index 628d285141bc..eb79767e62be 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -137,8 +137,8 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; v2i16 is naturally 4 byte aligned ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal +; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal +; EG: 16 ; EG: 16 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in @@ -153,11 +153,11 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; GCN-DAG: s_sext_i32_i16 ; v2i16 is naturally 4 byte aligned +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; TODO: We should also use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal +; TODO: We should use ASHR instead of LSHR + BFE +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { @@ -167,16 +167,23 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ret void } -; FUNC-LABEL: {{^}}constant_constant_zextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32: ; GCN: s_load_dwordx2 ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, +; EG: CF_END +; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 +; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 ; TODO: This should use DST, but for some there are redundant MOVs -; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG: 16 -define void @constant_constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 16 +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 65535 +; EG-DAG: 65535 +define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -184,19 +191,20 @@ entry: ret void } -; FUNC-LABEL: {{^}}constant_constant_sextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32: ; GCN: s_load_dwordx2 +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 
[[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal +; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 +; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -204,20 +212,24 @@ entry: ret void } -; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32: +; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32: ; GCN: s_load_dwordx2 ; GCN-DAG: s_and_b32 ; GCN-DAG: s_lshr_b32 ; v4i16 is naturally 8 byte aligned -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}} +; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use LD, but for some there are redundant MOVs +; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal ; EG-DAG: 16 -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal ; EG-DAG: 16 -define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 65535 +; EG-DAG: 65535 +define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -230,13 +242,14 @@ define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* % ; GCN-DAG: s_sext_i32_i16 ; v4i16 is naturally 8 byte aligned -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use LD, but for some there are redundant MOVs +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal ; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, 
{{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 @@ -254,24 +267,27 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; GCN-DAG: s_lshr_b32 ; v8i16 is naturally 16 byte aligned -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use LSHR instead of BFE_UINT +; TODO: This should use DST, but for some there are redundant MOVs +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 +; EG-DAG: 65535 +; EG-DAG: 65535 +; EG-DAG: 65535 +; EG-DAG: 65535 define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -285,17 +301,19 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; GCN-DAG: s_sext_i32_i16 ; v8i16 is naturally 16 byte aligned -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT +; TODO: This should use DST, but for some there are redundant MOVs +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, 
{{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 @@ -444,7 +462,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? +; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in @@ -468,7 +486,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? +; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index f398dd32e06d..7bd131e6516c 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented @@ -10,7 +10,7 @@ ; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}} ; GCN-HSA: flat_load_ushort -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %ld = load i16, i16 addrspace(1)* %in @@ -22,7 +22,7 @@ entry: ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -34,8 +34,8 @@ entry: ; GCN-NOHSA: buffer_load_dwordx2 v ; GCN-HSA: flat_load_dwordx2 v -; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 +; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in @@ -47,7 +47,7 @@ entry: ; GCN-NOHSA: 
buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -59,7 +59,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in @@ -74,8 +74,8 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in @@ -90,7 +90,7 @@ entry: ; GCN-HSA: flat_load_ushort ; GCN-HSA: flat_store_dword -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 @@ -105,9 +105,9 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-HSA: flat_load_sshort ; GCN-HSA: flat_store_dword -; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 -; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; EG: 16 +; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 +; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; EGCM: 16 define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 @@ -119,7 +119,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-NOHSA: buffer_load_ushort ; GCN-HSA: flat_load_ushort -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> @@ -131,9 +131,9 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; GCN-NOHSA: buffer_load_sshort ; GCN-HSA: flat_load_sshort -; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 -; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; EG: 16 +; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 +; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; EGCM: 16 define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> @@ -145,10 +145,9 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; GCN-NOHSA: buffer_load_dword ; GCN-HSA: flat_load_dword -; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG: BFE_UINT {{[* 
]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG: 16 +; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 +; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal +; EGCM: 16 define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -161,13 +160,14 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; GCN-HSA: flat_load_dword -; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; TODO: We should also use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: 16 -; EG-DAG: 16 +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EGCM: VTX_READ_32 [[DST:T[0-9].[XYZW]]], [[DST]], 0, #1 +; TODO: This should use ASHR instead of LSHR + BFE +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> @@ -175,16 +175,22 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ret void } -; FUNC-LABEL: {{^}}global_global_zextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 ; TODO: This should use DST, but for some there are redundant MOVs -; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG: 16 -define void @global_global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM: 16 +; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal +; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal +define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -192,19 +198,23 @@ entry: ret void } -; FUNC-LABEL: {{^}}global_global_sextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* 
]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: 16 -; EG-DAG: 16 -define void @global_global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -212,19 +222,22 @@ entry: ret void } -; FUNC-LABEL: {{^}}global_global_zextload_v4i16_to_v4i32: +; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: 16 -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: 16 -define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*}}, literal +; EGCM-DAG: 16 +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal +; EGCM-DAG: 16 +define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -236,17 +249,19 @@ define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 ; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; 
EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> @@ -258,16 +273,29 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: CF_END +; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use LSHR instead of BFE_UINT +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -279,24 +307,29 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}} +; 
EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: CF_END +; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use ASHR instead of LSHR + BFE_INT +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -311,8 +344,8 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -322,8 +355,8 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32: -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -342,10 +375,10 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -364,10 +397,10 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: 
VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -394,14 +427,14 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> @@ -411,14 +444,14 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32: -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 
112, #1 define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> @@ -434,8 +467,8 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: MOV {{.*}}, 0.0 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: MOV {{.*}}, 0.0 define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 @@ -458,10 +491,10 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? -; EG: 31 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal +; TODO: These could be expanded earlier using ASHR 15 +; EGCM: 31 define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 @@ -471,8 +504,8 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64: -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: MOV {{.*}}, 0.0 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: MOV {{.*}}, 0.0 define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> @@ -482,10 +515,10 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64: -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? 
-; EG: 31 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal +; TODO: These could be expanded earlier using ASHR 15 +; EGCM: 31 define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> @@ -503,7 +536,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64: -; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> @@ -513,7 +546,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> @@ -523,7 +556,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> @@ -533,7 +566,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> @@ -543,7 +576,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> @@ -553,8 +586,8 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> @@ -564,8 +597,8 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, 
T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> @@ -575,10 +608,10 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> @@ -588,10 +621,10 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index 5d64a152af3c..13d56535303f 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,10 +1,9 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: -; SI: v_min_i32_e32 +; GCN: v_min_i32_e32 ; EG: MIN_INT define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -17,7 +16,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: {{^}}s_test_imin_sle_i32: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -28,7 +27,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: 
MIN_INT define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { @@ -39,10 +38,10 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32: -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT ; EG: MIN_INT @@ -56,11 +55,11 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_i8: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_sext_i32_i8 -; SI: s_sext_i32_i8 -; SI: s_min_i32 +; GCN: s_load_dword +; GCN: s_load_dword +; GCN: s_sext_i32_i8 +; GCN: s_sext_i32_i8 +; GCN: s_min_i32 define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b @@ -72,21 +71,26 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { ; extloads with mubuf instructions. ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 -; SI: s_endpgm +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 + +; GCN: s_endpgm ; EG: MIN_INT ; EG: MIN_INT @@ -117,7 +121,7 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, < } ; FUNC-LABEL: @v_test_imin_slt_i32 -; SI: v_min_i32_e32 +; GCN: v_min_i32_e32 ; EG: MIN_INT define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -130,7 +134,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: @s_test_imin_slt_i32 -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -141,8 +145,8 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32: -; SI: s_min_i32 -; SI: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT ; EG: MIN_INT @@ -154,7 +158,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -165,7 +169,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { } ; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -176,7 +180,7 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { } ; FUNC-LABEL: @v_test_umin_ule_i32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ 
-189,11 +193,11 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: @v_test_umin_ule_v3i32 -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 +; GCN: v_min_u32_e32 +; GCN: v_min_u32_e32 ; SI-NOT: v_min_u32_e32 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -207,7 +211,7 @@ define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrs ret void } ; FUNC-LABEL: @s_test_umin_ule_i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -218,7 +222,7 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: @v_test_umin_ult_i32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -231,9 +235,9 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_min_u32_e32 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { @@ -246,7 +250,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i } ; FUNC-LABEL: @s_test_umin_ult_i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -258,10 +262,10 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use ; SI-NOT: v_min -; SI: v_cmp_lt_u32 +; GCN: v_cmp_lt_u32 ; SI-NEXT: v_cndmask_b32 ; SI-NOT: v_min -; SI: s_endpgm +; GCN: s_endpgm ; EG-NOT: MIN_UINT define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -274,9 +278,27 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ret void } +; FUNC-LABEL: @v_test_umin_ult_i16_multi_use +; GCN-NOT: v_min +; GCN: v_cmp_lt_u32 +; GCN-NEXT: v_cndmask_b32 +; GCN-NOT: v_min +; GCN: s_endpgm + +; EG-NOT: MIN_UINT +define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %a = load i16, i16 addrspace(1)* %aptr, align 2 + %b = load i16, i16 addrspace(1)* %bptr, align 2 + %cmp = icmp ult i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out0, align 2 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + ; FUNC-LABEL: @s_test_umin_ult_v1i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { @@ -287,14 +309,14 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32: -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -312,14 +334,14 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < } 
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -338,11 +360,11 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, < ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI: buffer_store_dword [[VMIN]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { @@ -358,11 +380,11 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1 ; Make sure redundant sign_extend_inreg removed. ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI: buffer_store_dword [[VMIN]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { @@ -377,7 +399,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 } ; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { @@ -389,7 +411,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; 64 bit ; FUNC-LABEL: {{^}}test_umin_ult_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -401,7 +423,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_umin_ule_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -413,7 +435,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_imin_slt_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT @@ -425,7 +447,7 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_imin_sle_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll new file mode 100644 index 000000000000..866a4a9191e2 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=r600 -mcpu=cypress -start-after safe-stack %s -o - | 
FileCheck %s +; Don't crash + +; CHECK: MAX_UINT +define void @test(i64 addrspace(1)* %out) { +bb: + store i64 2, i64 addrspace(1)* %out + %tmp = load i64, i64 addrspace(1)* %out + br label %jump + +jump: ; preds = %bb + %tmp1 = icmp ugt i64 %tmp, 4 + %umax = select i1 %tmp1, i64 %tmp, i64 4 + store i64 %umax, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll new file mode 100644 index 000000000000..33d27f24e9cf --- /dev/null +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -0,0 +1,743 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}store_i1: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_i1(i1 addrspace(0)* %out) { +entry: + store i1 true, i1 addrspace(0)* %out + ret void +} + +; i8 store +; FUNC-LABEL: {{^}}store_i8: +; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 2 +; EG: MOVA_INT * AR.x (MASKED) +; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG-NEXT: 3(4.203895e-45) +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x +; EG-NEXT: 255(3.573311e-43) + +; EG: NOT_INT +; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]] +; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]] +; TODO: Is the reload necessary? +; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]] +; EG: MOV * T(0 + AR.x).X+, [[RES]] + +; SI: buffer_store_byte + +define void @store_i8(i8 addrspace(0)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(0)* %out + ret void +} + +; i16 store +; FUNC-LABEL: {{^}}store_i16: +; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 2 +; EG: MOVA_INT * AR.x (MASKED) +; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG-NEXT: 3(4.203895e-45) +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x +; EG-NEXT: 65535(9.183409e-41) + +; EG: NOT_INT +; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]] +; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]] +; TODO: Is the reload necessary? 
+; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]] +; EG: MOV * T(0 + AR.x).X+, [[RES]] + +; SI: buffer_store_short +define void @store_i16(i16 addrspace(0)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i24: +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_short + +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store can be eliminated +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store can be eliminated +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +define void @store_i24(i24 addrspace(0)* %out, i24 %in) { +entry: + store i24 %in, i24 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i25: +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}} +; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]] +; SI: buffer_store_dword [[VAND]] + +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT +define void @store_i25(i25 addrspace(0)* %out, i25 %in) { +entry: + store i25 %in, i25 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8: +; v2i8 is naturally 2B aligned, treat as i16 +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_short +define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 + ret void +} + + +; FUNC-LABEL: {{^}}store_v2i16: +; v2i8 is naturally 2B aligned, treat as i16 +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_dword +define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}store_v2i16_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +; SI: buffer_store_short +define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_dword +define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI-NOT: buffer_store_dword +define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}store_v8i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: 
MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI-NOT: buffer_store_dword +define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +entry: + %0 = trunc <8 x i32> %in to <8 x i8> + store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8_halfaligned: +; EG: MOVA_INT +; EG: MOV {{[\* 
]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +; SI: buffer_store_short +; SI-NOT: buffer_store_dword +define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 + ret void +} + +; floating-point store +; FUNC-LABEL: {{^}}store_f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_dword + +define void @store_f32(float addrspace(0)* %out, float %in) { + store float %in, float addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i16: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(0)* %out + ret void +} + +; vec2 floating-point stores +; FUNC-LABEL: {{^}}store_v2f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword + +define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { +entry: + %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0 + %1 = insertelement <2 x float> %0, float %b, i32 1 + store <2 x float> %1, <2 x float> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v3i32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? 
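Each MOVA_INT/MOV pair in the EG and CM patterns above covers one 32-bit element: private vector stores are scalarized into per-element dword writes through the indexed AR window, and on SI they currently come out as separate buffer_store_dword instructions (the XSI lines below are non-matching check prefixes that record the dwordx2/dwordx4 forms the TODOs ask about). As a rough illustration only, the two-element store above behaves as if it had been rewritten to the following scalarized IR; the function and value names are invented for this sketch.

    define void @store_v2f32_scalarized(<2 x float> addrspace(0)* %out, <2 x float> %v) {
    entry:
      %base = bitcast <2 x float> addrspace(0)* %out to float addrspace(0)*
      %e0 = extractelement <2 x float> %v, i32 0
      %e1 = extractelement <2 x float> %v, i32 1
      store float %e0, float addrspace(0)* %base                ; first MOVA_INT + MOV pair
      %p1 = getelementptr float, float addrspace(0)* %base, i32 1
      store float %e1, float addrspace(0)* %p1                  ; second MOVA_INT + MOV pair
      ret void
    }
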
+; XSI-DAG: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? +; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? +; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4 + ret void +} + +; v4f32 store +; FUNC-LABEL: {{^}}store_v4f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? 
+; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(0) * %in + store <4 x float> %1, <4 x float> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i8: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i16: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(0)* %out + ret void +} + +; The stores in this function are combined by the optimizer to create a +; 64-bit store with 32-bit alignment. This is legal and the legalizer +; should not try to split the 64-bit store back into 2 32-bit stores. + +; FUNC-LABEL: {{^}}vecload2: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +entry: + %0 = load i32, i32 addrspace(2)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 + %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + store i32 %0, i32 addrspace(0)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 + store i32 %1, i32 addrspace(0)* %arrayidx1, align 4 + ret void +} + +; When i128 was a legal type this program generated cannot select errors: + +; FUNC-LABEL: {{^}}"i128-const-store": +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? 
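The comment on vecload2 a few lines above describes the shape the IR takes after the optimizer combines the two adjacent 32-bit stores: a single 64-bit store that keeps only 32-bit alignment, which the legalizer has to handle without splitting it back apart. A hand-written equivalent of one possible merged form, purely for illustration (the test itself feeds the unmerged IR to llc, and the names below are invented):

    define void @vecload2_merged(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) {
    entry:
      %lo32 = load i32, i32 addrspace(2)* %mem, align 4
      %hip  = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
      %hi32 = load i32, i32 addrspace(2)* %hip, align 4
      %lo   = zext i32 %lo32 to i64
      %hi   = zext i32 %hi32 to i64
      %hish = shl i64 %hi, 32
      %val  = or i64 %lo, %hish
      %wide = bitcast i32 addrspace(0)* %out to i64 addrspace(0)*
      store i64 %val, i64 addrspace(0)* %wide, align 4          ; one 64-bit store, only 32-bit aligned
      ret void
    }
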
+; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @i128-const-store(i32 addrspace(0)* %out) { +entry: + store i32 1, i32 addrspace(0)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 + store i32 1, i32 addrspace(0)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 2 + store i32 2, i32 addrspace(0)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 3 + store i32 2, i32 addrspace(0)* %arrayidx6, align 4 + ret void +} + + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AVR/intrinsics/read_register.ll b/test/CodeGen/AVR/intrinsics/read_register.ll new file mode 100644 index 000000000000..3f28d1d3a9fe --- /dev/null +++ b/test/CodeGen/AVR/intrinsics/read_register.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 < %s -march=avr | FileCheck %s + +; CHECK-LABEL: foo +define void @foo() { +entry: + %val1 = call i16 @llvm.read_register.i16(metadata !0) + %val2 = call i16 @llvm.read_register.i16(metadata !1) + %val3 = call i8 @llvm.read_register.i8(metadata !2) + ret void +} + +declare i8 @llvm.read_register.i8(metadata) +declare i16 @llvm.read_register.i16(metadata) + +!0 = !{!"r28"} +!1 = !{!"Z"} +!2 = !{!"r0"} diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll new file mode 100644 index 000000000000..49980da6eb8f --- /dev/null +++ b/test/CodeGen/WebAssembly/function-bitcasts.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; Test that function pointer casts are replaced with wrappers. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; CHECK-LABEL: test: +; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}} +; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}} +; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0 +; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}} +; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}} +; CHECK-NEXT: call foo2@FUNCTION{{$}} +; CHECK-NEXT: call foo3@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast: +; CHECK-NEXT: .local i32 +; CHECK-NEXT: call has_i32_arg@FUNCTION, $0{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.1: +; CHECK-NEXT: call $drop=, has_i32_ret@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.2: +; CHECK-NEXT: .param i32 +; CHECK-NEXT: call foo0@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.3: +; CHECK-NEXT: .result i32 +; CHECK-NEXT: .local i32 +; CHECK-NEXT: call foo1@FUNCTION{{$}} +; CHECK-NEXT: copy_local $push0=, $0 +; CHECK-NEXT: .endfunc + +declare void @has_i32_arg(i32) +declare i32 @has_i32_ret() + +declare void @foo0() +declare void @foo1() +declare void @foo2() +declare void @foo3() + +define void @test() { +entry: + call void bitcast (void (i32)* @has_i32_arg to void ()*)() + call void bitcast (i32 ()* @has_i32_ret to void ()*)() + call void bitcast (void ()* @foo0 to void (i32)*)(i32 0) + %t = call i32 bitcast (void ()* @foo1 to i32 ()*)() + call void bitcast (void ()* @foo2 to void ()*)() + call void @foo3() + ret void +} diff --git a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll new file mode 100644 index 000000000000..ef4318ec299b --- /dev/null +++ b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; 
Test that function pointer casts that require conversions are not converted +; to wrappers. In theory some conversions could be supported, but currently no +; conversions are implemented. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; CHECK-LABEL: test: +; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: call has_i64_arg@FUNCTION, $pop[[L0]]{{$}} +; CHECK-NEXT: i32.call $drop=, has_i64_ret@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-NOT: .Lbitcast + +declare void @has_i64_arg(i64) +declare i64 @has_i64_ret() + +define void @test() { +entry: + call void bitcast (void (i64)* @has_i64_arg to void (i32)*)(i32 0) + %t = call i32 bitcast (i64 ()* @has_i64_ret to i32 ()*)() + ret void +} diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index e1341624cad3..aec74424b9b2 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ -142,17 +142,108 @@ define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone ret <16 x i16> %x } -define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { +define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { +; X32-LABEL: mul_v16i8: +; X32: ## BB#0: +; X32-NEXT: vpmovsxbw %xmm1, %ymm1 +; X32-NEXT: vpmovsxbw %xmm0, %ymm0 +; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: mul_v16i8: +; X64: ## BB#0: +; X64-NEXT: vpmovsxbw %xmm1, %ymm1 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vzeroupper +; X64-NEXT: retq %x = mul <16 x i8> %i, %j ret <16 x i8> %x } -define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: mul_v32i8: +; X32: ## BB#0: +; X32-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X32-NEXT: vpmovsxbw %xmm2, %ymm2 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X32-NEXT: vpmovsxbw %xmm3, %ymm3 +; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; X32-NEXT: vextracti128 $1, %ymm2, %xmm3 +; X32-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X32-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; X32-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-NEXT: vpmovsxbw %xmm1, %ymm1 +; X32-NEXT: vpmovsxbw %xmm0, %ymm0 +; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X32-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; X32-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_v32i8: +; X64: ## BB#0: +; X64-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X64-NEXT: vpmovsxbw %xmm2, %ymm2 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X64-NEXT: vpmovsxbw %xmm3, %ymm3 +; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; X64-NEXT: vextracti128 $1, %ymm2, %xmm3 +; X64-NEXT: vmovdqa {{.*#+}} xmm4 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: vpmovsxbw %xmm1, %ymm1 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; X64-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <32 x i8> %i, %j ret <32 x i8> %x } -define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: mul_v4i64: +; X32: ## BB#0: +; X32-NEXT: vpsrlq $32, %ymm0, %ymm2 +; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; X32-NEXT: vpsrlq $32, %ymm1, %ymm3 +; X32-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; X32-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; X32-NEXT: vpsllq $32, %ymm2, %ymm2 +; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_v4i64: +; X64: ## BB#0: +; X64-NEXT: vpsrlq $32, %ymm0, %ymm2 +; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; X64-NEXT: vpsrlq $32, %ymm1, %ymm3 +; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; X64-NEXT: vpsllq $32, %ymm2, %ymm2 +; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <4 x i64> %i, %j ret <4 x i64> %x } @@ -291,8 +382,8 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { ret <8 x i32> %y } +; %x * 0x01010101 define <4 x i32> @mul_const10(<4 x i32> %x) { - ; %x * 0x01010101 ; X32-LABEL: mul_const10: ; X32: ## BB#0: ; X32-NEXT: vpbroadcastd LCPI22_0, %xmm1 @@ -308,8 +399,8 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { ret <4 x i32> %m } +; %x * 0x80808080 define <4 x i32> @mul_const11(<4 x i32> %x) { - ; %x * 0x80808080 ; X32-LABEL: mul_const11: ; X32: ## BB#0: ; X32-NEXT: vpbroadcastd LCPI23_0, %xmm1 diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll index 0dcfb7c169f3..e66eefdb8e9f 100644 --- a/test/CodeGen/X86/avx512-bugfix-23634.ll +++ b/test/CodeGen/X86/avx512-bugfix-23634.ll @@ -15,7 +15,7 @@ define void @f_fu(float* %ret, float* %aa, float %b) { ; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2 ; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd {{.*}}(%rip), %zmm1, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1} ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 532678ae72fa..1a91bc1dee9a 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -25,8 +25,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -48,8 +47,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd 
$255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b @@ -65,8 +63,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -88,8 +85,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b @@ -180,8 +176,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: Lcfi1: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: callq _func16xi1 ; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -210,8 +205,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL_X32-NEXT: Lcfi1: ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: calll _func16xi1 ; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -285,8 +279,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: movb $85, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq @@ -322,8 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: movb $85, %al ; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl diff --git 
a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index c2eb19d16650..5e50a3aef2f2 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -740,8 +740,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; KNL-NEXT: retq ; @@ -805,11 +804,10 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) { ; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm1, %ymm1 ; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; KNL-NEXT: retq @@ -834,8 +832,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; KNL-NEXT: retq @@ -858,8 +855,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) { ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 32bd0804d637..03d6127ae5dc 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -345,9 +345,9 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -369,9 +369,9 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxbd (%rdi), %ymm0 -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxbd (%rdi), %ymm1 +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 
%zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -704,9 +704,9 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -728,9 +728,9 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxwd (%rdi), %ymm0 -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxwd (%rdi), %ymm1 +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -762,9 +762,9 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -1457,8 +1457,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; KNL-LABEL: sext_16i1_16i32: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: sext_16i1_16i32: diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 26d14fa0840f..cb8ed0e59a3a 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -365,11 +365,10 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -402,11 +401,10 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd 
$255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1242,30 +1240,29 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: vpextrd $1, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpextrd $3, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -1306,11 +1303,10 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index d48f63536e0e..b127585dc87b 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -344,8 +344,7 @@ define <16 x i8> @test8(<16 x 
i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB17_1: ; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 ; KNL-NEXT: LBB17_3: -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -382,8 +381,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB18_3: ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -472,8 +470,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-NEXT: movw $1, %cx ; KNL-NEXT: cmovgw %ax, %cx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -510,28 +507,27 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: movl $1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm2, %ymm0 -; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -574,30 +570,29 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; 
KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al ; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -635,18 +630,17 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kshiftlw $6, %k2, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: kshiftlw $7, %k0, %k0 ; KNL-NEXT: korw %k0, %k1, %k1 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -1387,8 +1381,7 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_8i1: @@ -1405,8 +1398,7 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) { ; KNL-LABEL: load_16i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_16i1: @@ -1424,8 +1416,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq ; @@ -1444,8 +1435,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; 
KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; KNL-NEXT: retq @@ -1465,10 +1455,9 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 ; KNL-NEXT: retq ; @@ -1489,17 +1478,16 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; KNL-NEXT: kmovw 2(%rdi), %k2 ; KNL-NEXT: kmovw 4(%rdi), %k3 ; KNL-NEXT: kmovw 6(%rdi), %k4 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_64i1: diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll index 2a0de05608b4..9234ae838cff 100644 --- a/test/CodeGen/X86/avx512-mov.ll +++ b/test/CodeGen/X86/avx512-mov.ll @@ -313,7 +313,7 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* @@ -327,7 +327,7 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* @@ -369,7 +369,7 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## 
encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* @@ -383,7 +383,7 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* @@ -426,7 +426,7 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* @@ -441,7 +441,7 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* @@ -486,7 +486,7 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* @@ -501,7 +501,7 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* diff --git 
a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll index ce8fca036c91..a29c1e4628a1 100644 --- a/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -325,11 +325,13 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) { } ; X32-LABEL: test_argRet128Vector: -; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0 +; X32: vmovdqa{{.*}} %xmm0, %xmm1 +; X32: vmovdqa{{.*}} %xmm1, %xmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet128Vector: -; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0 +; WIN64: vmovdqa{{.*}} %xmm0, %xmm1 +; WIN64: vmovdqa{{.*}} %xmm1, %xmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 128 bit vector @@ -341,13 +343,13 @@ define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b) ; X32-LABEL: test_CallargRet128Vector: ; X32: vmov{{.*}} %xmm0, {{%xmm([0-7])}} ; X32: call{{.*}} {{.*}}test_argRet128Vector -; X32: vpblend{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0 +; X32: vmovdqa{{.*}} {{%xmm([0-7])}}, %xmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet128Vector: ; WIN64: vmov{{.*}} %xmm0, {{%xmm([0-9]+)}} ; WIN64: call{{.*}} {{.*}}test_argRet128Vector -; WIN64: vpblend{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0 +; WIN64: vmovdqa{{.*}} {{%xmm([0-9]+)}}, %xmm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 128 bit vector @@ -358,11 +360,13 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) { } ; X32-LABEL: test_argRet256Vector: -; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0 +; X32: vmovdqa{{.*}} %ymm0, %ymm1 +; X32: vmovdqa{{.*}} %ymm1, %ymm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet256Vector: -; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0 +; WIN64: vmovdqa{{.*}} %ymm0, %ymm1 +; WIN64: vmovdqa{{.*}} %ymm1, %ymm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 256 bit vector @@ -374,13 +378,13 @@ define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b) ; X32-LABEL: test_CallargRet256Vector: ; X32: vmov{{.*}} %ymm0, %ymm1 ; X32: call{{.*}} {{.*}}test_argRet256Vector -; X32: vpblend{{.*}} %ymm1, %ymm0, %ymm0 +; X32: vmovdqa{{.*}} %ymm1, %ymm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet256Vector: ; WIN64: vmov{{.*}} %ymm0, %ymm1 ; WIN64: call{{.*}} {{.*}}test_argRet256Vector -; WIN64: vpblend{{.*}} %ymm1, %ymm0, %ymm0 +; WIN64: vmovdqa{{.*}} %ymm1, %ymm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 256 bit vector @@ -391,11 +395,13 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) { } ; X32-LABEL: test_argRet512Vector: -; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0 +; X32: vmovdqa{{.*}} %zmm0, %zmm1 +; X32: vmovdqa{{.*}} %zmm1, %zmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet512Vector: -; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0 +; WIN64: vmovdqa{{.*}} %zmm0, %zmm1 +; WIN64: vmovdqa{{.*}} %zmm1, %zmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 512 bit vector @@ -407,13 +413,13 @@ define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> ; X32-LABEL: test_CallargRet512Vector: ; X32: vmov{{.*}} %zmm0, %zmm1 ; X32: call{{.*}} {{.*}}test_argRet512Vector -; X32: vpblend{{.*}} %zmm1, %zmm0, %zmm0 +; X32: movdqa{{.*}} %zmm1, %zmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet512Vector: ; WIN64: vmov{{.*}} %zmm0, %zmm1 ; WIN64: call{{.*}} {{.*}}test_argRet512Vector -; WIN64: vpblend{{.*}} %zmm1, %zmm0, %zmm0 +; WIN64: vmovdqa{{.*}} %zmm1, %zmm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 512 bit vector diff --git 
a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 840239b9011a..1991ee4f3376 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -218,8 +218,7 @@ define <16 x i32> @test_vbroadcast() { ; ALL: # BB#0: # %entry ; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1 -; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: knotw %k1, %k1 ; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index bd269ea87a35..361ee1ddbf9d 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -6,7 +6,8 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = fcmp ole <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y @@ -17,7 +18,8 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = fcmp ole <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y @@ -28,7 +30,8 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y @@ -40,7 +43,8 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) ; CHECK-LABEL: test4_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp uge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -51,7 +55,8 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp eq <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y @@ -62,7 +67,8 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun ; CHECK-LABEL: test6_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp ugt <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y @@ -81,7 +87,8 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, 
%xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %mask = fcmp olt <4 x float> %a, zeroinitializer @@ -101,7 +108,8 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %mask = fcmp olt <2 x double> %a, zeroinitializer %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b @@ -114,14 +122,15 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovdqa %ymm1, %ymm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y @@ -134,14 +143,15 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %mask = fcmp oeq <8 x float> %x, %y @@ -658,9 +668,9 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b) define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { ; CHECK-LABEL: test14: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm2, %k1 -; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %sub_r = sub <16 x i32> %a, %b %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a @@ -673,9 +683,9 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) { ; CHECK-LABEL: test15: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm2, %k1 -; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %sub_r = sub <8 x i64> %a, %b %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a @@ -689,7 +699,8 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind ; CHECK-LABEL: test16: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -700,7 +711,8 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: 
test17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sgt <16 x i32> %x, %y @@ -712,7 +724,8 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sle <16 x i32> %x, %y @@ -724,7 +737,8 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp ule <16 x i32> %x, %y @@ -737,7 +751,8 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i32> %x1, %y1 %mask0 = icmp eq <16 x i32> %x, %y @@ -751,7 +766,8 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %mask0 = icmp sle <8 x i64> %x, %y @@ -765,7 +781,8 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <8 x i64> %x1, %y1 %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4 @@ -780,7 +797,8 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 @@ -794,7 +812,8 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { ; CHECK-LABEL: test24: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 @@ -808,7 +827,8 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind ; CHECK-LABEL: test25: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vpblendmd 
%zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 @@ -823,7 +843,8 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -840,7 +861,8 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -858,8 +880,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; KNL-NEXT: kxnorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: retq ; @@ -883,8 +904,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -912,7 +932,8 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind { ; SKX-LABEL: test30: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %mask = fcmp oeq <4 x double> %x, %y @@ -930,7 +951,8 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp ; SKX-LABEL: test31: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %y = load <2 x double>, <2 x double>* %yp, align 4 @@ -949,7 +971,8 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp ; SKX-LABEL: test32: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %y = load <4 x double>, <4 x double>* %yp, align 4 @@ -962,7 +985,8 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp ; CHECK-LABEL: test33: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <8 x double>, <8 x double>* %yp, align 4 %mask = fcmp olt <8 x double> %x, %y @@ -980,7 +1004,8 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no ; 
SKX-LABEL: test34: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %y = load <4 x float>, <4 x float>* %yp, align 4 %mask = fcmp olt <4 x float> %x, %y @@ -995,14 +1020,15 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vmovups (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test35: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %y = load <8 x float>, <8 x float>* %yp, align 4 @@ -1015,7 +1041,8 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp ; CHECK-LABEL: test36: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x float>, <16 x float>* %yp, align 4 %mask = fcmp olt <16 x float> %x, %y @@ -1027,7 +1054,8 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou ; CHECK-LABEL: test37: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %a = load double, double* %ptr @@ -1050,7 +1078,8 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou ; SKX-LABEL: test38: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1073,7 +1102,8 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou ; SKX-LABEL: test39: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1090,7 +1120,8 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n ; CHECK-LABEL: test40: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %a = load float, float* %ptr @@ -1109,14 +1140,15 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vbroadcastss (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test41: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %a = load float, float* 
%ptr @@ -1139,7 +1171,8 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun ; SKX-LABEL: test42: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %a = load float, float* %ptr @@ -1158,7 +1191,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test43: @@ -1166,7 +1200,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; SKX-NEXT: vpsllw $15, %xmm2, %xmm2 ; SKX-NEXT: vpmovw2m %xmm2, %k1 ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 ; SKX-NEXT: retq %a = load double, double* %ptr diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll index c58b3cc8c3cd..11bb431414a0 100644 --- a/test/CodeGen/X86/avx512bw-mov.ll +++ b/test/CodeGen/X86/avx512bw-mov.ll @@ -26,7 +26,7 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpblendmb (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <64 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <64 x i8>* @@ -74,7 +74,7 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpblendmw (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <32 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <32 x i16>* diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll index 016837e61307..34432468921b 100644 --- a/test/CodeGen/X86/avx512bw-vec-cmp.ll +++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -5,7 +5,8 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y @@ -16,7 +17,8 @@ define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sgt <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y @@ -27,7 +29,8 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sge <32 x i16> %x, %y %max 
= select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y @@ -38,7 +41,8 @@ define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp ugt <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y @@ -49,7 +53,8 @@ define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwin ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %yp, align 4 %mask = icmp eq <32 x i16> %x, %y @@ -61,7 +66,8 @@ define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sgt <32 x i16> %x, %y @@ -73,7 +79,8 @@ define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sle <32 x i16> %x, %y @@ -85,7 +92,8 @@ define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp ule <32 x i16> %x, %y @@ -98,7 +106,8 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <32 x i16> %x1, %y1 %mask0 = icmp eq <32 x i16> %x, %y @@ -112,7 +121,8 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <64 x i8> %x1, %y1 %mask0 = icmp sle <64 x i8> %x, %y @@ -126,7 +136,8 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <64 x i8> %x1, %y1 %y = load <64 x i8>, <64 x i8>* %y.ptr, align 4 @@ -141,7 +152,8 @@ define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 ; 
CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <32 x i16> %x1, %y1 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll index 209f18ba7f9c..3f92641a3e16 100644 --- a/test/CodeGen/X86/avx512bwvl-mov.ll +++ b/test/CodeGen/X86/avx512bwvl-mov.ll @@ -26,7 +26,7 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmb (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x66,0x07] +; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <32 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <32 x i8>* @@ -74,7 +74,7 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmw (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x66,0x07] +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i16>* @@ -122,7 +122,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmb (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x66,0x07] +; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i8>* @@ -170,7 +170,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmw (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x66,0x07] +; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i16>* diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll index 17e581bbb501..3e7f0acae78b 100644 --- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll +++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll @@ -5,7 +5,8 @@ define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: test256_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa 
%ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp eq <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y @@ -16,7 +17,8 @@ define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask = icmp sgt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -27,7 +29,8 @@ define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounw ; CHECK-LABEL: test256_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sge <16 x i16> %x, %y %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y @@ -38,7 +41,8 @@ define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask = icmp ugt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -49,7 +53,8 @@ define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nou ; CHECK-LABEL: test256_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %yp, align 4 %mask = icmp eq <16 x i16> %x, %y @@ -61,7 +66,8 @@ define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sgt <16 x i16> %x, %y @@ -73,7 +79,8 @@ define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sle <16 x i16> %x, %y @@ -85,7 +92,8 @@ define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp ule <16 x i16> %x, %y @@ -98,7 +106,8 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i16> %x1, %y1 %mask0 = icmp eq <16 x i16> %x, %y @@ -112,7 +121,8 @@ define <32 x i8> 
@test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <32 x i8> %x1, %y1 %mask0 = icmp sle <32 x i8> %x, %y @@ -126,7 +136,8 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <32 x i8> %x1, %y1 %y = load <32 x i8>, <32 x i8>* %y.ptr, align 4 @@ -141,7 +152,8 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i16> %x1, %y1 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 @@ -155,7 +167,8 @@ define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: test128_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp eq <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y @@ -166,7 +179,8 @@ define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask = icmp sgt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -177,7 +191,8 @@ define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind ; CHECK-LABEL: test128_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sge <8 x i16> %x, %y %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y @@ -188,7 +203,8 @@ define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask = icmp ugt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -199,7 +215,8 @@ define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwin ; CHECK-LABEL: test128_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %yp, align 4 %mask = icmp eq <8 x i16> %x, %y @@ -211,7 +228,8 @@ define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_6: ; CHECK: ## BB#0: ; CHECK-NEXT: 
vpcmpgtw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sgt <8 x i16> %x, %y @@ -223,7 +241,8 @@ define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sle <8 x i16> %x, %y @@ -235,7 +254,8 @@ define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp ule <8 x i16> %x, %y @@ -248,7 +268,8 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <8 x i16> %x1, %y1 %mask0 = icmp eq <8 x i16> %x, %y @@ -262,7 +283,8 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i8> %x1, %y1 %mask0 = icmp sle <16 x i8> %x, %y @@ -276,7 +298,8 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <16 x i8> %x1, %y1 %y = load <16 x i8>, <16 x i8>* %y.ptr, align 4 @@ -291,7 +314,8 @@ define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i16> %x1, %y1 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll index e37fd76377e3..af449d6628c4 100644 --- a/test/CodeGen/X86/avx512vl-mov.ll +++ b/test/CodeGen/X86/avx512vl-mov.ll @@ -166,7 +166,7 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## 
encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i32>* @@ -180,7 +180,7 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i32>* @@ -222,7 +222,7 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i64>* @@ -236,7 +236,7 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i64>* @@ -279,7 +279,7 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -294,7 +294,7 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -338,7 +338,7 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, 
%ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x double>* @@ -352,7 +352,7 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x double>* @@ -554,7 +554,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i32>* @@ -568,7 +568,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i32>* @@ -610,7 +610,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x i64>* @@ -624,7 +624,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, 
zeroinitializer %vaddr = bitcast i8* %addr to <2 x i64>* @@ -666,7 +666,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* @@ -680,7 +680,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* @@ -722,7 +722,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x double>* @@ -736,7 +736,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x double>* diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll index e0acf2be653e..25b9cc79096f 100644 --- a/test/CodeGen/X86/avx512vl-vec-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -5,7 +5,8 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: test256_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp eq <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y @@ -16,7 +17,8 @@ define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind ; CHECK-LABEL: test256_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sgt <4 x i64> %x, %y 
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y @@ -27,7 +29,8 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind ; CHECK-LABEL: test256_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y @@ -38,7 +41,8 @@ define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind ; CHECK-LABEL: test256_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp ugt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y @@ -49,7 +53,8 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin ; CHECK-LABEL: test256_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %x, %y @@ -61,7 +66,8 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_5b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %y, %x @@ -73,7 +79,8 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sgt <8 x i32> %x, %y @@ -85,7 +92,8 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_6b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp slt <8 x i32> %y, %x @@ -97,7 +105,8 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sle <8 x i32> %x, %y @@ -109,7 +118,8 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_7b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sge <8 x i32> %y, %x @@ -121,7 +131,8 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_8: ; CHECK: ## BB#0: 
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp ule <8 x i32> %x, %y @@ -133,7 +144,8 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_8b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp uge <8 x i32> %y, %x @@ -146,7 +158,8 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp eq <8 x i32> %x1, %y1 %mask0 = icmp eq <8 x i32> %x, %y @@ -160,7 +173,8 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %mask0 = icmp sle <4 x i64> %x, %y @@ -174,7 +188,8 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <4 x i64> %x1, %y1 %y = load <4 x i64>, <4 x i64>* %y.ptr, align 4 @@ -189,7 +204,8 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 @@ -203,7 +219,8 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind ; CHECK-LABEL: test256_13: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 @@ -217,7 +234,8 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind ; CHECK-LABEL: test256_14: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 @@ -232,7 +250,8 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, 
%ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -249,7 +268,8 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -265,7 +285,8 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %x, %y @@ -277,7 +298,8 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %y, %x @@ -289,7 +311,8 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %x, %y @@ -301,7 +324,8 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_20: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %y, %x @@ -313,7 +337,8 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: test128_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp eq <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y @@ -324,7 +349,8 @@ define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind ; CHECK-LABEL: test128_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sgt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y @@ -335,7 +361,8 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind ; CHECK-LABEL: test128_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> 
%x1, <4 x i32> %y @@ -346,7 +373,8 @@ define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind ; CHECK-LABEL: test128_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp ugt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y @@ -357,7 +385,8 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin ; CHECK-LABEL: test128_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %x, %y @@ -369,7 +398,8 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi ; CHECK-LABEL: test128_5b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %y, %x @@ -381,7 +411,8 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sgt <4 x i32> %x, %y @@ -393,7 +424,8 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_6b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp slt <4 x i32> %y, %x @@ -405,7 +437,8 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sle <4 x i32> %x, %y @@ -417,7 +450,8 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_7b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sge <4 x i32> %y, %x @@ -429,7 +463,8 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ule <4 x i32> %x, %y @@ -441,7 +476,8 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_8b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), 
%xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x @@ -454,7 +490,8 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <4 x i32> %x1, %y1 %mask0 = icmp eq <4 x i32> %x, %y @@ -468,7 +505,8 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %mask0 = icmp sle <2 x i64> %x, %y @@ -482,7 +520,8 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <2 x i64> %x1, %y1 %y = load <2 x i64>, <2 x i64>* %y.ptr, align 4 @@ -497,7 +536,8 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 @@ -511,7 +551,8 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind ; CHECK-LABEL: test128_13: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 @@ -525,7 +566,8 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind ; CHECK-LABEL: test128_14: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 @@ -540,7 +582,8 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -557,7 +600,8 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 
{%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -573,7 +617,8 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %x, %y @@ -585,7 +630,8 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %y, %x @@ -597,7 +643,8 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %x, %y @@ -609,7 +656,8 @@ define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_20: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index 8e9bc8b5af4b..0060539c691f 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -157,16 +157,12 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { ret i8 %d } -; FIXME: The 'not' is redundant. 
- define i32 @smin(i32 %x) { ; CHECK-LABEL: smin: ; CHECK: ## BB#0: -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: notl %ecx ; CHECK-NEXT: xorl $-1, %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovsl %ecx, %eax +; CHECK-NEXT: cmovsl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll index 5636a5bcd73e..5329f5b216a4 100644 --- a/test/CodeGen/X86/fma-fneg-combine.ll +++ b/test/CodeGen/X86/fma-fneg-combine.ll @@ -222,9 +222,9 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i ; SKX-NEXT: kmovw %edi, %k1 ; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3 ; SKX-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; SKX-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1} -; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; SKX-NEXT: vmovaps %zmm3, %zmm0 ; SKX-NEXT: retq ; ; KNL-LABEL: test15: @@ -232,9 +232,9 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm3 ; KNL-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; KNL-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1} -; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1} -; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; KNL-NEXT: vmovaps %zmm3, %zmm0 ; KNL-NEXT: retq entry: %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a diff --git a/test/CodeGen/X86/fmaddsub-combine.ll b/test/CodeGen/X86/fmaddsub-combine.ll new file mode 100644 index 000000000000..f3b13cd053b4 --- /dev/null +++ b/test/CodeGen/X86/fmaddsub-combine.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s + +; This test checks the fusing of MUL + ADDSUB to FMADDSUB. 
+ +define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { +; FMA3-LABEL: mul_addsub_pd128: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd128: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <2 x double> %A, %B + %Sub = fsub <2 x double> %AB, %C + %Add = fadd <2 x double> %AB, %C + %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3> + ret <2 x double> %Addsub +} + +define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { +; FMA3-LABEL: mul_addsub_ps128: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_ps128: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <4 x float> %A, %B + %Sub = fsub <4 x float> %AB, %C + %Add = fadd <4 x float> %AB, %C + %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %Addsub +} + +define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { +; FMA3-LABEL: mul_addsub_pd256: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd256: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <4 x double> %A, %B + %Sub = fsub <4 x double> %AB, %C + %Add = fadd <4 x double> %AB, %C + %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %Addsub +} + +define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { +; FMA3-LABEL: mul_addsub_ps256: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_ps256: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <8 x float> %A, %B + %Sub = fsub <8 x float> %AB, %C + %Add = fadd <8 x float> %AB, %C + %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x float> %Addsub +} + +define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { +; FMA3_256-LABEL: mul_addsub_pd512: +; FMA3_256: # BB#0: # %entry +; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1 +; FMA3_256-NEXT: retq +; +; FMA3_512-LABEL: mul_addsub_pd512: +; FMA3_512: # BB#0: # %entry +; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 +; FMA3_512-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd512: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +entry: + %AB = fmul <8 x double> %A, %B + %Sub = fsub <8 x double> %AB, %C + %Add = fadd <8 x double> %AB, %C + %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x double> %Addsub +} + +define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { +; FMA3_256-LABEL: mul_addsub_ps512: +; FMA3_256: # BB#0: # %entry +; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: 
vfmaddsub213ps %ymm5, %ymm3, %ymm1 +; FMA3_256-NEXT: retq +; +; FMA3_512-LABEL: mul_addsub_ps512: +; FMA3_512: # BB#0: # %entry +; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 +; FMA3_512-NEXT: retq +; +; FMA4-LABEL: mul_addsub_ps512: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +entry: + %AB = fmul <16 x float> %A, %B + %Sub = fsub <16 x float> %AB, %C + %Add = fadd <16 x float> %AB, %C + %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x float> %Addsub +} + +attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll index 7159d4c87174..32594a27698d 100644 --- a/test/CodeGen/X86/sse-fsignum.ll +++ b/test/CodeGen/X86/sse-fsignum.ll @@ -93,15 +93,14 @@ define void @signum32b(<8 x float>*) { ; AVX512F-NEXT: vmovaps (%rdi), %ymm0 ; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512F-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512F-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) ; AVX512F-NEXT: retq entry: diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index abe3da752874..c34f333ef785 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -4,6 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; ; 128-bit vector comparisons @@ -308,12 +310,26 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; 
AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i16> %a0, %a1 ret <16 x i1> %1 } @@ -589,13 +605,26 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v8f64: -; AVX512: # BB#0: -; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v8f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v8f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v8f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 ret <8 x i1> %1 } @@ -636,13 +665,26 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16f32: -; AVX512: # BB#0: -; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16f32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16f32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16f32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 ret <16 x i1> %1 } @@ -734,13 +776,26 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v8i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v8i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v8i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v8i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovqw %zmm0, 
%xmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 ret <8 x i1> %1 } @@ -784,13 +839,26 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 ret <16 x i1> %1 } @@ -1045,16 +1113,35 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i16> %a0, %a1 ret <32 x i1> %1 } @@ -1874,15 +1961,31 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v64i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512-NEXT: vmovdqa %xmm4, %xmm2 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm4, 
%xmm3 +; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v64i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <64 x i8> %a0, %a1 ret <64 x i1> %1 } @@ -1957,120 +2060,350 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16f64: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5 -; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6 -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512-NEXT: vucomisd %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: 
vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vucomisd %xmm3, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vucomisd %xmm3, %xmm1 -; AVX512-NEXT: cmovaq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm5 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm6 +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; 
AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vucomisd %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vucomisd %xmm3, %xmm1 +; AVX512F-NEXT: cmovaq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm5 +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} 
xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm6 +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm6 +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = 
xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 +; AVX512DQ-NEXT: cmovaq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: 
cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 +; AVX512BW-NEXT: cmovaq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x double> %a0, %a1 ret <16 x i1> %1 } @@ -2416,207 +2749,612 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32f32: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm6 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomiss %xmm5, %xmm7 -; AVX512-NEXT: movl $-1, %ecx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX512-NEXT: vucomiss %xmm7, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5 -; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: 
vpinsrd $3, %edx, %xmm5, %xmm8 -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm7 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm6, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm5, %xmm7 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] -; AVX512-NEXT: vucomiss %xmm6, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 -; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8 -; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm7 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm6, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm5, %xmm7 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] -; AVX512-NEXT: vucomiss %xmm6, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 -; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512-NEXT: vucomiss %xmm6, %xmm7 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm5 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm2, %xmm5 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, 
%esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] -; AVX512-NEXT: vucomiss %xmm6, %xmm7 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm2, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2 -; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm5, %xmm7 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0] -; AVX512-NEXT: vucomiss %xmm7, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm0, %xmm5 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] -; AVX512-NEXT: vucomiss %xmm6, %xmm7 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm0, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm3, %xmm1 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm4 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] -; AVX512-NEXT: vucomiss %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm3, %xmm1 -; AVX512-NEXT: cmoval %ecx, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 -; 
AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32f32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm6 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomiss %xmm5, %xmm7 +; AVX512F-NEXT: movl $-1, %ecx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm4, %xmm6 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512F-NEXT: vucomiss %xmm7, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm4, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8 +; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm7 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm6, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm5, %xmm7 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm4 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512F-NEXT: vucomiss %xmm6, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8 +; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm7 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm6, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm5, %xmm7 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm4 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512F-NEXT: vucomiss %xmm6, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512F-NEXT: 
vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512F-NEXT: vucomiss %xmm6, %xmm7 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm5 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm4, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm2, %xmm5 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm4 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512F-NEXT: vucomiss %xmm6, %xmm7 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm2, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2 +; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm6 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm5, %xmm7 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm4, %xmm6 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0] +; AVX512F-NEXT: vucomiss %xmm7, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm4, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm0, %xmm5 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: 
vmovd %esi, %xmm4 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512F-NEXT: vucomiss %xmm6, %xmm7 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm0, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX512F-NEXT: vucomiss %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmoval %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm4 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] +; AVX512F-NEXT: vucomiss %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmoval %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: cmoval %ecx, %eax +; AVX512F-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32f32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7 +; AVX512DQ-NEXT: movl $-1, %ecx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512DQ-NEXT: vucomiss %xmm7, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm7 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512DQ-NEXT: vucomiss %xmm6, 
%xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm0, %xmm7 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm1, %xmm5 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5 +; 
AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm1, %xmm6 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0] +; AVX512DQ-NEXT: vucomiss %xmm7, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm3, %xmm0 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmoval %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmoval %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: cmoval %ecx, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32f32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: 
vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm6 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: vucomiss %xmm5, %xmm7 +; AVX512BW-NEXT: movl $-1, %ecx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512BW-NEXT: vucomiss %xmm7, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm7 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm5, %xmm7 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm7 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm5, %xmm7 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; 
AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm7 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm5 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm2, %xmm5 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm7 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm2, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm6 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm7 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0] +; AVX512BW-NEXT: vucomiss %xmm7, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm0 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; 
AVX512BW-NEXT: vucomiss %xmm6, %xmm7 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 +; AVX512BW-NEXT: cmoval %ecx, %eax +; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <32 x float> %a0, %a1 ret <32 x i1> %1 } @@ -2785,136 +3523,398 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16i64: -; AVX512: # BB#0: -; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpq %rcx, %rdx -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6 -; AVX512-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vmovq %xmm5, %rdx -; AVX512-NEXT: vmovq %xmm6, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; AVX512-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vmovq %xmm5, %rdx -; AVX512-NEXT: vmovq %xmm6, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; 
AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm3, %rdx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: cmovgq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rcx +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: cmpq %rcx, %rdx +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq 
%rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vmovq %xmm5, %rdx +; AVX512F-NEXT: vmovq %xmm6, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vmovq %xmm5, %rdx +; AVX512F-NEXT: vmovq %xmm6, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vmovq %xmm2, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm2, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512F-NEXT: cmpq 
%rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm3, %rdx +; AVX512F-NEXT: vmovq %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: cmovgq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rcx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpq %rcx, %rdx +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm5, %rdx +; AVX512DQ-NEXT: vmovq %xmm6, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm5, %rdx +; AVX512DQ-NEXT: vmovq %xmm6, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm2, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512DQ-NEXT: 
vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm2, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm3, %rdx +; AVX512DQ-NEXT: vmovq %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: cmovgq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rcx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpq %rcx, %rdx +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; 
AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vmovq %xmm5, %rdx +; AVX512BW-NEXT: vmovq %xmm6, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vmovq %xmm5, %rdx +; AVX512BW-NEXT: vmovq %xmm6, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vmovq %xmm2, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm2, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = 
xmm4[0],xmm6[0] +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm3, %rdx +; AVX512BW-NEXT: vmovq %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmovgq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i64> %a0, %a1 ret <16 x i1> %1 } @@ -3252,223 +4252,660 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32i32: -; AVX512: # BB#0: -; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vpextrd $1, %xmm4, %ecx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm5, %edx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpl %ecx, %edx -; AVX512-NEXT: movl $-1, %ecx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm4, %esi -; AVX512-NEXT: vmovd %xmm5, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm6 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $2, %xmm4, %edx -; AVX512-NEXT: vpextrd $2, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $3, %xmm4, %edx -; AVX512-NEXT: vpextrd $3, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 -; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm5, %edx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6 -; AVX512-NEXT: vpextrd $1, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm5, %esi -; AVX512-NEXT: vmovd %xmm6, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm7 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 -; AVX512-NEXT: vpextrd $2, %xmm5, %edx -; AVX512-NEXT: vpextrd $2, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 -; AVX512-NEXT: vpextrd $3, %xmm5, %edx -; AVX512-NEXT: vpextrd $3, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm5, %edx -; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; AVX512-NEXT: vpextrd $1, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm5, %esi -; AVX512-NEXT: vmovd %xmm6, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm7 -; AVX512-NEXT: vpinsrd $1, 
%edx, %xmm7, %xmm7 -; AVX512-NEXT: vpextrd $2, %xmm5, %edx -; AVX512-NEXT: vpextrd $2, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 -; AVX512-NEXT: vpextrd $3, %xmm5, %edx -; AVX512-NEXT: vpextrd $3, %xmm6, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm2, %edx -; AVX512-NEXT: vpextrd $1, %xmm0, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm2, %esi -; AVX512-NEXT: vmovd %xmm0, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm6 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $2, %xmm2, %edx -; AVX512-NEXT: vpextrd $2, %xmm0, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $3, %xmm2, %edx -; AVX512-NEXT: vpextrd $3, %xmm0, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vpextrd $1, %xmm2, %edx -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vpextrd $1, %xmm4, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm2, %esi -; AVX512-NEXT: vmovd %xmm4, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrd $2, %xmm2, %edx -; AVX512-NEXT: vpextrd $2, %xmm4, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrd $3, %xmm2, %edx -; AVX512-NEXT: vpextrd $3, %xmm4, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 -; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vpextrd $1, %xmm4, %edx -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm4, %esi -; AVX512-NEXT: vmovd %xmm5, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm6 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $2, %xmm4, %edx -; AVX512-NEXT: vpextrd $2, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $3, %xmm4, %edx -; AVX512-NEXT: vpextrd $3, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vpextrd $1, %xmm4, %edx -; AVX512-NEXT: 
vextracti32x4 $1, %zmm1, %xmm5 -; AVX512-NEXT: vpextrd $1, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm4, %esi -; AVX512-NEXT: vmovd %xmm5, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm6 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $2, %xmm4, %edx -; AVX512-NEXT: vpextrd $2, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 -; AVX512-NEXT: vpextrd $3, %xmm4, %edx -; AVX512-NEXT: vpextrd $3, %xmm5, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 -; AVX512-NEXT: vpextrd $1, %xmm3, %edx -; AVX512-NEXT: vpextrd $1, %xmm1, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vmovd %xmm3, %esi -; AVX512-NEXT: vmovd %xmm1, %edi -; AVX512-NEXT: cmpl %esi, %edi -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovgl %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrd $2, %xmm3, %edx -; AVX512-NEXT: vpextrd $2, %xmm1, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgl %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 -; AVX512-NEXT: vpextrd $3, %xmm3, %edx -; AVX512-NEXT: vpextrd $3, %xmm1, %esi -; AVX512-NEXT: cmpl %edx, %esi -; AVX512-NEXT: cmovgl %ecx, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm4, %ecx +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: cmpl %ecx, %edx +; AVX512F-NEXT: movl $-1, %ecx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm4, %esi +; AVX512F-NEXT: vmovd %xmm5, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: vpextrd $2, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx +; AVX512F-NEXT: vpextrd $3, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm5, %esi +; AVX512F-NEXT: vmovd %xmm6, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm7 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; 
AVX512F-NEXT: vpextrd $2, %xmm5, %edx +; AVX512F-NEXT: vpextrd $2, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx +; AVX512F-NEXT: vpextrd $3, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm5, %esi +; AVX512F-NEXT: vmovd %xmm6, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm7 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $2, %xmm5, %edx +; AVX512F-NEXT: vpextrd $2, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx +; AVX512F-NEXT: vpextrd $3, %xmm6, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm2, %edx +; AVX512F-NEXT: vpextrd $1, %xmm0, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm2, %esi +; AVX512F-NEXT: vmovd %xmm0, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm2, %edx +; AVX512F-NEXT: vpextrd $2, %xmm0, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm2, %edx +; AVX512F-NEXT: vpextrd $3, %xmm0, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vpextrd $1, %xmm2, %edx +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm4, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm2, %esi +; AVX512F-NEXT: vmovd %xmm4, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm2, %edx +; AVX512F-NEXT: vpextrd $2, %xmm4, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm2, %edx +; AVX512F-NEXT: vpextrd $3, %xmm4, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 +; AVX512F-NEXT: vextracti32x4 $2, 
%zmm3, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm4, %edx +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm4, %esi +; AVX512F-NEXT: vmovd %xmm5, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: vpextrd $2, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx +; AVX512F-NEXT: vpextrd $3, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm4, %edx +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm4, %esi +; AVX512F-NEXT: vmovd %xmm5, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: vpextrd $2, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx +; AVX512F-NEXT: vpextrd $3, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm3, %edx +; AVX512F-NEXT: vpextrd $1, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm3, %esi +; AVX512F-NEXT: vmovd %xmm1, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm3, %edx +; AVX512F-NEXT: vpextrd $2, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm3, %edx +; AVX512F-NEXT: vpextrd $3, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: cmovgl %ecx, %eax +; AVX512F-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %ecx +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpl %ecx, %edx +; AVX512DQ-NEXT: movl $-1, %ecx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi 
+; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm5, %esi +; AVX512DQ-NEXT: vmovd %xmm6, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm7 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm5, %esi +; AVX512DQ-NEXT: vmovd %xmm6, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm7 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $1, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm2, %esi +; AVX512DQ-NEXT: vmovd %xmm0, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: 
movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm2, %esi +; AVX512DQ-NEXT: vmovd %xmm4, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $1, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd 
%xmm3, %esi +; AVX512DQ-NEXT: vmovd %xmm1, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $2, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $3, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %ecx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpl %ecx, %edx +; AVX512BW-NEXT: movl $-1, %ecx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl 
%ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $1, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm0, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm4, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; 
AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $1, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm3, %esi +; AVX512BW-NEXT: vmovd %xmm1, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: cmovgl %ecx, %eax +; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i32> %a0, %a1 ret <32 x i1> %1 } @@ -4342,291 +5779,987 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v64i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm3 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $10, 
%k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm2 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512-NEXT: 
kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vpsllw $7, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vpxor %ymm6, %ymm6, %ymm6 -; AVX512-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 -; AVX512-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 -; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; 
AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill> -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v64i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: 
kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; 
AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpxor %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, 
%xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; 
AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill> +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v64i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm3 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpmovsxwd 
%ymm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm2 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpxor %ymm6, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: 
vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw 
%k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v64i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %ecx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpw %cx, %dx +; AVX512BW-NEXT: movw $-1, %cx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, 
%edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; 
AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $1, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm0, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm4, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx +; 
AVX512BW-NEXT: vpextrw $3, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: 
vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $1, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm3, %esi +; AVX512BW-NEXT: vmovd %xmm1, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $2, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $3, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $4, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $5, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $6, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $7, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: cmovgw %cx, %ax +; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <64 x i16> %a0, %a1 ret <64 x i1> %1 } @@ -6240,50 +8373,103 @@ define 
<128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v128i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 -; AVX512-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 -; AVX512-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512-NEXT: vpslld $31, %zmm4, %zmm4 -; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k0 -; AVX512-NEXT: kmovw %k0, 14(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kmovw %k0, 12(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kmovw %k0, 10(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kmovw %k0, 8(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kmovw %k0, 6(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kmovw %k0, 4(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kmovw %k0, 2(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kmovw %k0, (%rdi) -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v128i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512F-NEXT: vpslld $31, %zmm4, %zmm4 +; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-NEXT: kmovw %k0, 14(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kmovw %k0, 12(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kmovw %k0, 10(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, 8(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, 6(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, 4(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, 2(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
AVX512F-NEXT: kmovw %k0, (%rdi) +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v128i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512DQ-NEXT: vpslld $31, %zmm4, %zmm4 +; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512DQ-NEXT: kmovw %k0, 14(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 12(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 10(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 8(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v128i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm1, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm2, %zmm0, %k1 +; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 +; AVX512BW-NEXT: retq %1 = icmp sgt <128 x i8> %a0, %a1 ret <128 x i1> %1 } @@ -6781,231 +8967,684 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32f64: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm4, %xmm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm9 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomisd %xmm8, %xmm9 -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vucomisd %xmm8, %xmm9 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm4, %xmm9 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] -; 
AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-NEXT: vextractf32x4 $1, %zmm4, %xmm9 -; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm10 -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vextractf32x4 $1, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vucomisd %xmm5, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm6, %xmm1 -; AVX512-NEXT: 
vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm6, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $1, %zmm6, %xmm1 -; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm4 -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: vucomisd %xmm6, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $3, %zmm7, %xmm1 -; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vucomisd %xmm1, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm7, %xmm2 -; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vextractf32x4 $1, %zmm7, %xmm2 -; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vucomisd %xmm7, %xmm3 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm3 -; AVX512-NEXT: cmovaq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm9 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomisd %xmm8, %xmm9 +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vucomisd %xmm8, %xmm9 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm9 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-NEXT: vextractf32x4 $1, %zmm4, %xmm9 +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm10 +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm0 +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: 
cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-NEXT: vextractf32x4 $1, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm0 +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm1 +; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $1, %zmm6, %xmm1 +; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm4 +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq 
%rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vucomisd %xmm6, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 +; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vucomisd %xmm1, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm2 +; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vextractf32x4 $1, %zmm7, %xmm2 +; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vucomisd %xmm7, %xmm3 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm3 +; AVX512F-NEXT: cmovaq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9 
+; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; 
AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vucomisd %xmm6, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm7, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 +; 
AVX512DQ-NEXT: vucomisd %xmm1, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vucomisd %xmm7, %xmm3 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm3 +; AVX512DQ-NEXT: cmovaq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm4, %xmm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm9 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: vucomisd %xmm8, %xmm9 +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vucomisd %xmm8, %xmm9 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm4, %xmm9 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm4, %xmm9 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm10 +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 
$1, %ymm9, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm6, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm6, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm6, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vucomisd %xmm6, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm7, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm7, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm7, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512BW-NEXT: vucomisd %xmm7, %xmm3 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm3 +; AVX512BW-NEXT: cmovaq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <32 x double> %a0, %a1 ret <32 x i1> %1 } @@ -7639,263 +10278,780 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32i64: -; AVX512: # BB#0: -; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm8 -; AVX512-NEXT: vpextrq $1, %xmm8, %rcx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpq %rcx, %rdx -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm8, %rdx -; AVX512-NEXT: vmovq %xmm9, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm10 -; AVX512-NEXT: vpextrq $1, %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vmovq %xmm9, %rdx -; AVX512-NEXT: vmovq %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-NEXT: vextracti32x4 $1, %zmm4, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm10 -; AVX512-NEXT: vpextrq $1, %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vmovq %xmm9, %rdx -; AVX512-NEXT: vmovq %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; 
AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512-NEXT: vextracti32x4 $3, %zmm5, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vextracti32x4 $1, %zmm5, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm5, %rdx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512-NEXT: vextracti32x4 $3, %zmm6, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm1, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 
= xmm1[0],xmm5[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm6, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vextracti32x4 $1, %zmm6, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX512-NEXT: vpextrq $1, %xmm6, %rdx -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm6, %rdx -; AVX512-NEXT: vmovq %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm1 -; AVX512-NEXT: vextracti32x4 $3, %zmm7, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm7, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 -; AVX512-NEXT: vextracti32x4 $1, %zmm7, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; 
AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm7, %rdx -; AVX512-NEXT: vmovq %xmm3, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: cmovgq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm8 +; AVX512F-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: cmpq %rcx, %rdx +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm8, %rdx +; AVX512F-NEXT: vmovq %xmm9, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm10 +; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vmovq %xmm9, %rdx +; AVX512F-NEXT: vmovq %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-NEXT: vextracti32x4 $1, %zmm4, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm10 +; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vmovq %xmm9, %rdx +; AVX512F-NEXT: vmovq %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq 
%rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-NEXT: vextracti32x4 $1, %zmm5, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm5, %rdx +; AVX512F-NEXT: vmovq %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm6, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm1, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm6, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vextracti32x4 $1, %zmm6, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: 
vextracti32x4 $1, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512F-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm6, %rdx +; AVX512F-NEXT: vmovq %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm7, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm7, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm2, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti32x4 $1, %zmm7, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512F-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm7, %rdx +; AVX512F-NEXT: vmovq %xmm3, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: cmovgq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpmovdb 
%zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpq %rcx, %rdx +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm8, %rdx +; AVX512DQ-NEXT: vmovq %xmm9, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vmovq %xmm9, %rdx +; AVX512DQ-NEXT: vmovq %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vmovq %xmm9, %rdx +; AVX512DQ-NEXT: vmovq %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; 
AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm5, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm5, %rdx +; AVX512DQ-NEXT: vmovq %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm1, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm6, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm5[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm6, %rdx +; AVX512DQ-NEXT: vmovq %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm7, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm2, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm7, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm7, %rdx +; AVX512DQ-NEXT: vmovq %xmm3, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: cmovgq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm8 +; AVX512BW-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: xorl %eax, %eax +; 
AVX512BW-NEXT: cmpq %rcx, %rdx +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm8, %rdx +; AVX512BW-NEXT: vmovq %xmm9, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm10 +; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vmovq %xmm9, %rdx +; AVX512BW-NEXT: vmovq %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm4, %xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm10 +; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vmovq %xmm9, %rdx +; AVX512BW-NEXT: vmovq %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vinserti128 $1, 
%xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm5, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm5, %rdx +; AVX512BW-NEXT: vmovq %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm6, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm1, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm6, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm6, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512BW-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm6, %rdx +; AVX512BW-NEXT: vmovq %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; 
AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm7, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm7, %xmm2 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm2, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm7, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512BW-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm7, %rdx +; AVX512BW-NEXT: vmovq %xmm3, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmovgq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i64> %a0, %a1 ret <32 x i1> %1 } diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 39fbc7611de8..774d615ae896 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1244,8 +1244,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: 
%XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512F-NEXT: retq ; @@ -1253,8 +1252,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; @@ -1435,8 +1433,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512F-NEXT: retq @@ -1445,8 +1442,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512BW-NEXT: retq @@ -1642,8 +1638,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512F-NEXT: retq ; @@ -1651,8 +1646,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; @@ -1945,8 +1939,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -1954,8 +1947,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; @@ -2348,8 +2340,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2357,8 +2348,7 @@ define <8 x i32> 
@load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; @@ -2860,8 +2850,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512-LABEL: load_sext_16i1_to_16i8: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -3398,8 +3387,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512-LABEL: load_sext_16i1_to_16i16: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; @@ -4244,12 +4232,11 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 ; AVX512-NEXT: kmovw 2(%rdi), %k2 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index 27b65b829923..440faa689fb8 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; @@ -321,13 +322,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: @@ -499,30 +509,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: @@ -911,30 +901,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: splatvar_shift_v16i8: ; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: 
vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: @@ -1221,13 +1191,21 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: @@ -1384,31 +1362,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; 
AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index ee1879b6696e..79902acfec24 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -212,13 +213,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -331,33 +340,41 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, 
%ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -608,34 +625,43 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -804,13 +830,20 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: constant_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } 
@@ -913,34 +946,41 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: constant_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index 1280641c557b..2c9e433cfb2c 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -26,25 +26,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] -; AVX512DQ-NEXT: vpsravd %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] -; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512DQ-NEXT: vpsravd %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -1025,24 +1014,13 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-NEXT: vpsravd %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 42488f2ec3a7..a7e1a531b659 100644 --- 
a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. @@ -290,13 +291,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: @@ -417,18 +427,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: @@ -701,18 +703,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: splatvar_shift_v16i8: ; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: @@ -955,13 +949,21 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: @@ -1064,19 +1066,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512-LABEL: 
constant_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 5223d7bba353..25667e7d1661 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -189,13 +190,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> 
%ZMM0<def> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -275,21 +284,29 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -490,22 +507,31 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), 
%ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -659,13 +685,20 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: constant_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, 
i16 15> ret <16 x i16> %shift } @@ -739,22 +772,29 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: constant_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index 4c3caf329fb7..3da8f9437e57 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -27,25 +27,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] -; AVX512DQ-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vpsrld $16, 
%ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] -; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -988,24 +977,13 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index 5c89949e924b..8706078b40c9 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
@@ -245,13 +246,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: @@ -367,17 +377,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: @@ -642,17 +645,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: splatvar_shift_v16i8: ; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddb %xmm0, 
%xmm0, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: @@ -827,13 +823,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: @@ -919,18 +920,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index eb52ae3ccaca..a1ef2791c1b0 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ 
b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; @@ -164,13 +165,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -240,20 +249,28 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; 
AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -446,21 +463,30 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
ret <32 x i8> %shift
}
@@ -571,13 +597,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: retq
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -645,21 +676,28 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index 520c3237a57f..b9c9b56427f1 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -27,25 +27,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
-; AVX512DQ-NEXT: vpsllvd %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
-; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
-; AVX512DQ-NEXT: vpsllvd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 2836d69a0fec..f4650ec741a7 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -178,13 +178,8 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll
index 04d6b3733246..37fd022999e4 100644
--- a/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -216,7 +216,8 @@ define <8 x i32> @mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru,
; CHECK: # BB#0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -686,3 +687,33 @@ define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x doub
%res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru
ret <2 x double> %res
}
+
+define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
+; CHECK-NEXT: retq
+ %q = load double, double* %x, align 1
+ %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
+ %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %vecinit2.i, <2 x double> %passthru
+ ret <2 x double> %res
+}
+
+define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask) {
+; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
+; CHECK-NEXT: retq
+ %q = load double, double* %x, align 1
+ %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
+ %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %vecinit2.i, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 3ad92737a2ef..4312b67546d2 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -71,13 +71,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
-; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -101,14 +100,13 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -157,13 +155,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1
-; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -185,8 +182,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
@@ -215,8 +211,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -241,8 +236,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
@@ -271,8 +265,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
@@ -301,13 +294,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
@@ -337,10 +329,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
@@ -367,8 +359,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -403,9 +394,8 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $96, %rsp
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
