author     Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
commit     ca089b24d48ef6fa8da2d0bb8c25bb802c4a95c0 (patch)
tree       3a28a772df9b17aef34f49e3c727965ad28c0c93 /test
parent     9df3605dea17e84f8183581f6103bd0c79e2a606 (diff)
Diffstat (limited to 'test')
-rw-r--r--  test/Analysis/BasicAA/unreachable-block.ll  2
-rw-r--r--  test/Analysis/CostModel/X86/slm-arith-costs.ll  28
-rw-r--r--  test/Analysis/DependenceAnalysis/BasePtrBug.ll  80
-rw-r--r--  test/Analysis/ScalarEvolution/guards.ll  6
-rw-r--r--  test/Assembler/2003-11-11-ImplicitRename.ll  3
-rw-r--r--  test/Assembler/2007-11-26-AttributeOverload.ll  2
-rw-r--r--  test/Assembler/atomic.ll  26
-rw-r--r--  test/Bitcode/Inputs/module-hash-strtab1.ll  10
-rw-r--r--  test/Bitcode/Inputs/module-hash-strtab2.ll  10
-rw-r--r--  test/Bitcode/atomic-no-syncscope.ll  17
-rw-r--r--  test/Bitcode/atomic-no-syncscope.ll.bc  bin 0 -> 1000 bytes
-rw-r--r--  test/Bitcode/atomic.ll  4
-rw-r--r--  test/Bitcode/compatibility-3.6.ll  24
-rw-r--r--  test/Bitcode/compatibility-3.7.ll  24
-rw-r--r--  test/Bitcode/compatibility-3.8.ll  24
-rw-r--r--  test/Bitcode/compatibility-3.9.ll  24
-rw-r--r--  test/Bitcode/compatibility-4.0.ll  24
-rw-r--r--  test/Bitcode/compatibility.ll  24
-rw-r--r--  test/Bitcode/memInstructions.3.2.ll  104
-rw-r--r--  test/Bitcode/module-hash-strtab.ll  15
-rw-r--r--  test/Bitcode/module_hash.ll  8
-rw-r--r--  test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll  2
-rw-r--r--  test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll  2
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll  8
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir  30
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir  38
-rw-r--r--  test/CodeGen/AArch64/arm64-csldst-mmo.ll  6
-rw-r--r--  test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll  4
-rw-r--r--  test/CodeGen/AArch64/arm64-misched-memdep-bug.ll  6
-rw-r--r--  test/CodeGen/AArch64/fence-singlethread.ll  2
-rw-r--r--  test/CodeGen/AArch64/preferred-function-alignment.ll  26
-rw-r--r--  test/CodeGen/AArch64/tailcall_misched_graph.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/add.i16.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/add.ll  18
-rw-r--r--  test/CodeGen/AMDGPU/add.v2i16.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/add_i128.ll  16
-rw-r--r--  test/CodeGen/AMDGPU/add_i64.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/addrspacecast.ll  33
-rw-r--r--  test/CodeGen/AMDGPU/alignbit-pat.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll  38
-rw-r--r--  test/CodeGen/AMDGPU/and-gcn.ll  3
-rw-r--r--  test/CodeGen/AMDGPU/and.ll  55
-rw-r--r--  test/CodeGen/AMDGPU/any_extend_vector_inreg.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/bitreverse.ll  20
-rw-r--r--  test/CodeGen/AMDGPU/bswap.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/cgp-addressing-modes.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/clamp-omod-special-case.mir  46
-rw-r--r--  test/CodeGen/AMDGPU/coalescer_remat.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir  187
-rw-r--r--  test/CodeGen/AMDGPU/constant-fold-mi-operands.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/copy-illegal-type.ll  62
-rw-r--r--  test/CodeGen/AMDGPU/ctlz.ll  75
-rw-r--r--  test/CodeGen/AMDGPU/ctlz_zero_undef.ll  78
-rw-r--r--  test/CodeGen/AMDGPU/ctpop.ll  91
-rw-r--r--  test/CodeGen/AMDGPU/ctpop64.ll  29
-rw-r--r--  test/CodeGen/AMDGPU/cttz_zero_undef.ll  19
-rw-r--r--  test/CodeGen/AMDGPU/cvt_f32_ubyte.ll  88
-rw-r--r--  test/CodeGen/AMDGPU/detect-dead-lanes.mir  10
-rw-r--r--  test/CodeGen/AMDGPU/ds_read2.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/ds_read2_superreg.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/ds_read2st64.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/early-if-convert-cost.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/early-if-convert.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll  11
-rw-r--r--  test/CodeGen/AMDGPU/extractelt-to-trunc.ll  14
-rw-r--r--  test/CodeGen/AMDGPU/fabs.f16.ll  14
-rw-r--r--  test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll  50
-rw-r--r--  test/CodeGen/AMDGPU/fadd.f16.ll  58
-rw-r--r--  test/CodeGen/AMDGPU/fadd64.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/fcanonicalize-elimination.ll  487
-rw-r--r--  test/CodeGen/AMDGPU/fcanonicalize.f16.ll  18
-rw-r--r--  test/CodeGen/AMDGPU/fcanonicalize.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/fcmp.f16.ll  312
-rw-r--r--  test/CodeGen/AMDGPU/fcmp64.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/fconst64.ll  9
-rw-r--r--  test/CodeGen/AMDGPU/fcopysign.f16.ll  91
-rw-r--r--  test/CodeGen/AMDGPU/fdiv.f16.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fdiv.ll  41
-rw-r--r--  test/CodeGen/AMDGPU/fma-combine.ll  34
-rw-r--r--  test/CodeGen/AMDGPU/fma.f64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fma.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fmax_legacy.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/fmed3.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fmin_legacy.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/fmul.f16.ll  22
-rw-r--r--  test/CodeGen/AMDGPU/fmul64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fmuladd.f16.ll  28
-rw-r--r--  test/CodeGen/AMDGPU/fmuladd.f32.ll  82
-rw-r--r--  test/CodeGen/AMDGPU/fmuladd.f64.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/fmuladd.v2f16.ll  18
-rw-r--r--  test/CodeGen/AMDGPU/fneg-combines.ll  62
-rw-r--r--  test/CodeGen/AMDGPU/fneg-fabs.f16.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fneg-fabs.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fneg.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/fold-immediate-output-mods.mir  53
-rw-r--r--  test/CodeGen/AMDGPU/fold-operands-order.mir  6
-rw-r--r--  test/CodeGen/AMDGPU/fp32_to_fp16.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fpext.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/fptosi.f16.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fptoui.f16.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fptrunc.f16.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/fract.f64.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/fract.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/frem.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/fsqrt.f64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/fsqrt.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/fsub.f16.ll  28
-rw-r--r--  test/CodeGen/AMDGPU/fsub.ll  24
-rw-r--r--  test/CodeGen/AMDGPU/fsub64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/ftrunc.f64.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/global-extload-i16.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/global-smrd-unknown.ll  20
-rw-r--r--  test/CodeGen/AMDGPU/half.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/imm.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/immv216.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/indirect-addressing-si.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/inline-asm.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.class.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/llvm.ceil.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/llvm.cos.f16.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/llvm.exp2.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/llvm.floor.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/llvm.fma.f16.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  24
-rw-r--r--  test/CodeGen/AMDGPU/llvm.log2.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/llvm.maxnum.f16.ll  22
-rw-r--r--  test/CodeGen/AMDGPU/llvm.minnum.f16.ll  22
-rw-r--r--  test/CodeGen/AMDGPU/llvm.rint.f16.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/llvm.round.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/llvm.sin.f16.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/llvm.sqrt.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/llvm.trunc.f16.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/load-global-f32.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/load-global-f64.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/load-global-i16.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/load-global-i32.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/load-global-i64.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/load-global-i8.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/load-weird-sizes.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/lower-mem-intrinsics.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir  227
-rw-r--r--  test/CodeGen/AMDGPU/mad-combine.ll  106
-rw-r--r--  test/CodeGen/AMDGPU/madak.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/madmk.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/max.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/merge-stores.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/mubuf.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/mul.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/multi-divergent-exit-region.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/no-shrink-extloads.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/or.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/reduce-load-width-alignment.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/regcoal-subrange-join.mir  162
-rw-r--r--  test/CodeGen/AMDGPU/reorder-stores.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/rotl.i64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/rotr.i64.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/rsq.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/s_movk_i32.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/sad.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/saddo.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/salu-to-valu.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/scalar_to_vector.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/schedule-global-loads.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/scratch-buffer.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/scratch-simple.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/sdiv.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/sdwa-peephole.ll  24
-rw-r--r--  test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll  54
-rw-r--r--  test/CodeGen/AMDGPU/select-vectors.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/select.f16.ll  63
-rw-r--r--  test/CodeGen/AMDGPU/setcc-fneg-constant.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/setcc.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/sext-in-reg.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/sgpr-copy.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/shift-i64-opts.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/shl.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir  161
-rw-r--r--  test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/sign_extend.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/sitofp.f16.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/sminmax.ll  26
-rw-r--r--  test/CodeGen/AMDGPU/sminmax.v2i16.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/spill-cfg-position.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/sra.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/srem.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/srl.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/ssubo.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/sub.i16.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/sub.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/sub.v2i16.ll  16
-rw-r--r--  test/CodeGen/AMDGPU/syncscopes.ll  19
-rw-r--r--  test/CodeGen/AMDGPU/trunc-bitcast-vector.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/trunc.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/uaddo.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/udiv.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/uitofp.f16.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/urem.ll  6
-rw-r--r--  test/CodeGen/AMDGPU/usubo.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/v_cndmask.ll  12
-rw-r--r--  test/CodeGen/AMDGPU/v_mac.ll  10
-rw-r--r--  test/CodeGen/AMDGPU/v_mac_f16.ll  38
-rw-r--r--  test/CodeGen/AMDGPU/vectorize-global-local.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/vop-shrink-frame-index.mir  161
-rw-r--r--  test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir  40
-rw-r--r--  test/CodeGen/AMDGPU/vselect.ll  25
-rw-r--r--  test/CodeGen/AMDGPU/waitcnt-permute.mir  12
-rw-r--r--  test/CodeGen/AMDGPU/xor.ll  8
-rw-r--r--  test/CodeGen/AMDGPU/zext-i64-bit-operand.ll  4
-rw-r--r--  test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll  24
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir  1252
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll  30
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir  20
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir  1612
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-legalizer.mir  33
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir  58
-rw-r--r--  test/CodeGen/ARM/arguments-nosplit-double.ll  1
-rw-r--r--  test/CodeGen/ARM/arguments-nosplit-i64.ll  1
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll  8
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-ldm.ll  4
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll  2
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vfma.ll  28
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll  10
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vldm.ll  6
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll  2
-rw-r--r--  test/CodeGen/ARM/fence-singlethread.ll  2
-rw-r--r--  test/CodeGen/ARM/ror.ll  33
-rw-r--r--  test/CodeGen/ARM/scavenging.mir  66
-rw-r--r--  test/CodeGen/AVR/branch-relaxation.ll  96
-rw-r--r--  test/CodeGen/AVR/ctlz.ll  5
-rw-r--r--  test/CodeGen/AVR/cttz.ll  4
-rw-r--r--  test/CodeGen/AVR/frmidx-iterator-bug.ll  33
-rw-r--r--  test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll  15
-rw-r--r--  test/CodeGen/AVR/pseudo/ANDIWRdK.mir  6
-rw-r--r--  test/CodeGen/AVR/pseudo/COMWRd.mir  2
-rw-r--r--  test/CodeGen/AVR/pseudo/ORIWRdK.mir  2
-rw-r--r--  test/CodeGen/AVR/pseudo/SBCIWRdK.mir  2
-rw-r--r--  test/CodeGen/AVR/pseudo/SUBIWRdK.mir  2
-rw-r--r--  test/CodeGen/AVR/select-mbb-placement-bug.ll  6
-rw-r--r--  test/CodeGen/BPF/undef.ll  58
-rw-r--r--  test/CodeGen/Generic/pr33094.ll  18
-rw-r--r--  test/CodeGen/Hexagon/convertdptoint.ll  8
-rw-r--r--  test/CodeGen/Hexagon/convertdptoll.ll  4
-rw-r--r--  test/CodeGen/Hexagon/convertsptoint.ll  4
-rw-r--r--  test/CodeGen/Hexagon/convertsptoll.ll  4
-rw-r--r--  test/CodeGen/Hexagon/dadd.ll  8
-rw-r--r--  test/CodeGen/Hexagon/dmul.ll  8
-rw-r--r--  test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll  8
-rw-r--r--  test/CodeGen/Hexagon/dsub.ll  8
-rw-r--r--  test/CodeGen/Hexagon/fadd.ll  8
-rw-r--r--  test/CodeGen/Hexagon/fmul.ll  8
-rw-r--r--  test/CodeGen/Hexagon/fsub.ll  8
-rw-r--r--  test/CodeGen/Hexagon/hasfp-crash1.ll  82
-rw-r--r--  test/CodeGen/Hexagon/hasfp-crash2.ll  83
-rw-r--r--  test/CodeGen/Hexagon/hvx-nontemporal.ll  28
-rw-r--r--  test/CodeGen/Hexagon/target-flag-ext.mir  24
-rw-r--r--  test/CodeGen/MIR/AArch64/atomic-memoperands.mir  4
-rw-r--r--  test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir  19
-rw-r--r--  test/CodeGen/MIR/AArch64/target-memoperands.mir  22
-rw-r--r--  test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir  20
-rw-r--r--  test/CodeGen/MIR/AMDGPU/syncscopes.mir  98
-rw-r--r--  test/CodeGen/MIR/AMDGPU/target-flags.mir  29
-rw-r--r--  test/CodeGen/MIR/Generic/runPass.mir  2
-rw-r--r--  test/CodeGen/MIR/Hexagon/target-flags.mir  36
-rw-r--r--  test/CodeGen/MIR/X86/tied-physical-regs-match.mir  22
-rw-r--r--  test/CodeGen/MSP430/Inst16mm.ll  4
-rw-r--r--  test/CodeGen/NVPTX/lower-aggr-copies.ll  61
-rw-r--r--  test/CodeGen/PowerPC/PR33636.ll  702
-rw-r--r--  test/CodeGen/PowerPC/atomics-regression.ll  528
-rw-r--r--  test/CodeGen/PowerPC/bitreverse.ll  23
-rw-r--r--  test/CodeGen/PowerPC/build-vector-tests.ll  4
-rw-r--r--  test/CodeGen/PowerPC/ppc-ctr-dead-code.ll  38
-rw-r--r--  test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll  32
-rw-r--r--  test/CodeGen/PowerPC/ppc64le-smallarg.ll  4
-rw-r--r--  test/CodeGen/PowerPC/pr33093.ll  165
-rw-r--r--  test/CodeGen/PowerPC/select-addrRegRegOnly.ll  37
-rw-r--r--  test/CodeGen/PowerPC/svr4-redzone.ll  6
-rw-r--r--  test/CodeGen/PowerPC/tailcall1-64.ll  7
-rw-r--r--  test/CodeGen/PowerPC/testBitReverse.ll  105
-rw-r--r--  test/CodeGen/PowerPC/vec_extract_p9.ll  167
-rw-r--r--  test/CodeGen/PowerPC/vec_int_ext.ll  253
-rw-r--r--  test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll  16
-rw-r--r--  test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir  34
-rw-r--r--  test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll  22
-rw-r--r--  test/CodeGen/WebAssembly/umulo-i64.ll  21
-rw-r--r--  test/CodeGen/X86/2012-08-16-setcc.ll  42
-rw-r--r--  test/CodeGen/X86/GC/badreadproto.ll  2
-rw-r--r--  test/CodeGen/X86/GC/badrootproto.ll  2
-rw-r--r--  test/CodeGen/X86/GC/badwriteproto.ll  2
-rw-r--r--  test/CodeGen/X86/GC/fat.ll  2
-rw-r--r--  test/CodeGen/X86/GC/outside.ll  2
-rw-r--r--  test/CodeGen/X86/GlobalISel/GV.ll  63
-rw-r--r--  test/CodeGen/X86/GlobalISel/add-vec.ll  173
-rw-r--r--  test/CodeGen/X86/GlobalISel/constant.ll  9
-rw-r--r--  test/CodeGen/X86/GlobalISel/ext-x86-64.ll  2
-rw-r--r--  test/CodeGen/X86/GlobalISel/ext.ll  36
-rw-r--r--  test/CodeGen/X86/GlobalISel/legalize-GV.mir  31
-rw-r--r--  test/CodeGen/X86/GlobalISel/legalize-ext.mir  171
-rw-r--r--  test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir  110
-rw-r--r--  test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll  22
-rw-r--r--  test/CodeGen/X86/GlobalISel/memop-scalar.ll  20
-rw-r--r--  test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir  27
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-GV.mir  99
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-constant.mir  31
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-ext.mir  64
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir  53
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir  74
-rw-r--r--  test/CodeGen/X86/GlobalISel/x86_64-fallback.ll  18
-rw-r--r--  test/CodeGen/X86/avg.ll  6
-rw-r--r--  test/CodeGen/X86/avx-cmp.ll  197
-rw-r--r--  test/CodeGen/X86/avx-load-store.ll  277
-rw-r--r--  test/CodeGen/X86/avx-schedule.ll  648
-rw-r--r--  test/CodeGen/X86/avx-unpack.ll  166
-rw-r--r--  test/CodeGen/X86/avx-vinsertf128.ll  118
-rw-r--r--  test/CodeGen/X86/avx2-vbroadcast.ll  12
-rw-r--r--  test/CodeGen/X86/avx512-cmp.ll  2
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract.ll  26
-rw-r--r--  test/CodeGen/X86/avx512-vec-cmp.ll  53
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-cmp.ll  925
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-masked-cmp.ll  50906
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-128.ll  156
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-256.ll  104
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-512.ll  1868
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll  3483
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll  3279
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool.ll  685
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-128.ll  156
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-256.ll  419
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-512.ll  1377
-rw-r--r--  test/CodeGen/X86/block-placement.ll  101
-rw-r--r--  test/CodeGen/X86/bool-simplify.ll  129
-rw-r--r--  test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll  1991
-rw-r--r--  test/CodeGen/X86/bswap-wide-int.ll  4
-rw-r--r--  test/CodeGen/X86/build-vector-128.ll  23
-rw-r--r--  test/CodeGen/X86/build-vector-256.ll  29
-rw-r--r--  test/CodeGen/X86/build-vector-512.ll  20
-rw-r--r--  test/CodeGen/X86/cast-vsel.ll  2
-rw-r--r--  test/CodeGen/X86/clear_upper_vector_element_bits.ll  236
-rw-r--r--  test/CodeGen/X86/cmov.ll  205
-rw-r--r--  test/CodeGen/X86/code_placement_cold_loop_blocks.ll  5
-rw-r--r--  test/CodeGen/X86/combine-avx-intrinsics.ll  47
-rw-r--r--  test/CodeGen/X86/combine-avx2-intrinsics.ll  69
-rw-r--r--  test/CodeGen/X86/combine-rotates.ll  80
-rw-r--r--  test/CodeGen/X86/combine-sse41-intrinsics.ll  72
-rw-r--r--  test/CodeGen/X86/constant-hoisting-bfi.ll  52
-rw-r--r--  test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll  124
-rw-r--r--  test/CodeGen/X86/extract-store.ll  2
-rw-r--r--  test/CodeGen/X86/extractelement-legalization-store-ordering.ll  51
-rw-r--r--  test/CodeGen/X86/fast-isel-abort-warm.ll  19
-rw-r--r--  test/CodeGen/X86/fast-isel-gc-intrinsics.ll  57
-rw-r--r--  test/CodeGen/X86/fastisel-softfloat.ll  15
-rw-r--r--  test/CodeGen/X86/fp128-i128.ll  2
-rw-r--r--  test/CodeGen/X86/gather-addresses.ll  16
-rw-r--r--  test/CodeGen/X86/half.ll  1045
-rw-r--r--  test/CodeGen/X86/illegal-bitfield-loadstore.ll  251
-rw-r--r--  test/CodeGen/X86/optimize-max-1.ll  51
-rw-r--r--  test/CodeGen/X86/optimize-max-2.ll  26
-rw-r--r--  test/CodeGen/X86/pr15309.ll  50
-rw-r--r--  test/CodeGen/X86/pr23603.ll  27
-rw-r--r--  test/CodeGen/X86/pr33715.ll  16
-rw-r--r--  test/CodeGen/X86/rdrand-x86_64.ll  19
-rw-r--r--  test/CodeGen/X86/rdrand.ll  119
-rw-r--r--  test/CodeGen/X86/rdseed-x86_64.ll  19
-rw-r--r--  test/CodeGen/X86/rdseed.ll  66
-rw-r--r--  test/CodeGen/X86/recip-fastmath.ll  116
-rw-r--r--  test/CodeGen/X86/recip-fastmath2.ll  162
-rw-r--r--  test/CodeGen/X86/regalloc-reconcile-broken-hints.ll  2
-rw-r--r--  test/CodeGen/X86/rotate4.ll  104
-rw-r--r--  test/CodeGen/X86/sbb.ll  46
-rw-r--r--  test/CodeGen/X86/select_const.ll  113
-rw-r--r--  test/CodeGen/X86/shift-codegen.ll  42
-rw-r--r--  test/CodeGen/X86/shift-folding.ll  57
-rw-r--r--  test/CodeGen/X86/shuffle-vs-trunc-256.ll  313
-rw-r--r--  test/CodeGen/X86/shuffle-vs-trunc-512.ll  422
-rw-r--r--  test/CodeGen/X86/sink-blockfreq.ll  2
-rw-r--r--  test/CodeGen/X86/sink-gep-before-mem-inst.ll  25
-rw-r--r--  test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll  55
-rw-r--r--  test/CodeGen/X86/sse-schedule.ll  248
-rw-r--r--  test/CodeGen/X86/sse2-schedule.ll  598
-rw-r--r--  test/CodeGen/X86/sse3-schedule.ll  48
-rw-r--r--  test/CodeGen/X86/sse41-schedule.ll  222
-rw-r--r--  test/CodeGen/X86/sse42-schedule.ll  38
-rw-r--r--  test/CodeGen/X86/sse4a-schedule.ll  95
-rw-r--r--  test/CodeGen/X86/ssse3-schedule.ll  74
-rw-r--r--  test/CodeGen/X86/swizzle-avx2.ll  73
-rw-r--r--  test/CodeGen/X86/tbm_patterns.ll  502
-rw-r--r--  test/CodeGen/X86/vec-copysign.ll  2
-rw-r--r--  test/CodeGen/X86/vec_return.ll  17
-rw-r--r--  test/CodeGen/X86/vec_shift6.ll  9
-rw-r--r--  test/CodeGen/X86/vec_unsafe-fp-math.ll  15
-rw-r--r--  test/CodeGen/X86/vector-popcnt-128.ll  93
-rw-r--r--  test/CodeGen/X86/vector-popcnt-256.ll  14
-rw-r--r--  test/CodeGen/X86/vector-popcnt-512.ll  120
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining-sse4a.ll  86
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining-ssse3.ll  15
-rw-r--r--  test/CodeGen/X86/vector-shuffle-sse4a.ll  129
-rw-r--r--  test/CodeGen/X86/vector-truncate-combine.ll  10
-rw-r--r--  test/CodeGen/X86/vector-tzcnt-128.ll  54
-rw-r--r--  test/CodeGen/X86/vector-tzcnt-256.ll  28
-rw-r--r--  test/CodeGen/X86/vector-tzcnt-512.ll  124
-rw-r--r--  test/CodeGen/X86/wide-integer-cmp.ll  2
-rw-r--r--  test/CodeGen/X86/x32-lea-1.ll  10
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll  229
-rw-r--r--  test/CodeGen/X86/zext-shl.ll  39
-rw-r--r--  test/CodeGen/X86/zext-trunc.ll  9
-rw-r--r--  test/DebugInfo/COFF/asm.ll  6
-rw-r--r--  test/DebugInfo/COFF/cpp-mangling.ll  4
-rw-r--r--  test/DebugInfo/COFF/fp-stack.ll  2
-rw-r--r--  test/DebugInfo/COFF/globals.ll  6
-rw-r--r--  test/DebugInfo/COFF/inlining-files.ll  4
-rw-r--r--  test/DebugInfo/COFF/inlining-header.ll  8
-rw-r--r--  test/DebugInfo/COFF/inlining-levels.ll  8
-rw-r--r--  test/DebugInfo/COFF/inlining-same-name.ll  6
-rw-r--r--  test/DebugInfo/COFF/inlining.ll  6
-rw-r--r--  test/DebugInfo/COFF/int8-char-type.ll  4
-rw-r--r--  test/DebugInfo/COFF/local-constant.ll  5
-rw-r--r--  test/DebugInfo/COFF/local-variable-gap.ll  7
-rw-r--r--  test/DebugInfo/COFF/local-variables.ll  26
-rw-r--r--  test/DebugInfo/COFF/long-name.ll  2
-rw-r--r--  test/DebugInfo/COFF/multifile.ll  8
-rw-r--r--  test/DebugInfo/COFF/multifunction.ll  12
-rw-r--r--  test/DebugInfo/COFF/pieces.ll  34
-rw-r--r--  test/DebugInfo/COFF/register-variables.ll  30
-rw-r--r--  test/DebugInfo/COFF/simple.ll  8
-rw-r--r--  test/DebugInfo/COFF/typedef.ll  4
-rw-r--r--  test/DebugInfo/COFF/types-array.ll  6
-rw-r--r--  test/DebugInfo/COFF/types-basic.ll  46
-rw-r--r--  test/DebugInfo/COFF/udts.ll  22
-rw-r--r--  test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o  bin 0 -> 1584 bytes
-rw-r--r--  test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s  201
-rwxr-xr-x  test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64-space (renamed from test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 space)  bin 8944 -> 8944 bytes
-rw-r--r--  test/DebugInfo/PDB/Inputs/every-type.cpp  63
-rw-r--r--  test/DebugInfo/PDB/Inputs/every-type.pdb  bin 0 -> 102400 bytes
-rw-r--r--  test/DebugInfo/PDB/Inputs/every-type.yaml  272
-rw-r--r--  test/DebugInfo/PDB/every-type.test  261
-rw-r--r--  test/DebugInfo/PDB/pdbdump-headers.test  116
-rw-r--r--  test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test  3
-rw-r--r--  test/DebugInfo/PDB/pdbdump-mergetypes.test  6
-rw-r--r--  test/DebugInfo/X86/dbg-declare-inalloca.ll  14
-rw-r--r--  test/DebugInfo/dwarfdump-str-offsets.test  148
-rw-r--r--  test/DebugInfo/invalid-relocations.test  35
-rw-r--r--  test/DebugInfo/llvm-symbolizer.test  7
-rw-r--r--  test/Instrumentation/MemorySanitizer/unsized_type.ll  22
-rw-r--r--  test/Instrumentation/ThreadSanitizer/atomic.ll  8
-rw-r--r--  test/LTO/Resolution/X86/linker-redef-thin.ll  16
-rw-r--r--  test/Linker/Inputs/syncscope-1.ll  6
-rw-r--r--  test/Linker/Inputs/syncscope-2.ll  6
-rw-r--r--  test/Linker/Inputs/thumb-module-inline-asm.ll  3
-rw-r--r--  test/Linker/link-arm-and-thumb-module-inline-asm.ll  20
-rw-r--r--  test/Linker/syncscopes.ll  11
-rw-r--r--  test/MC/AArch64/label-arithmetic-diags-elf.s  51
-rw-r--r--  test/MC/AMDGPU/gfx9_asm_all.s  459
-rw-r--r--  test/MC/AMDGPU/vop3p-err.s  41
-rw-r--r--  test/MC/AMDGPU/vop3p.s  63
-rw-r--r--  test/MC/ARM/elf-movt.s  24
-rw-r--r--  test/MC/ARM/invalid-instructions-spellcheck.s  68
-rw-r--r--  test/MC/ARM/ldr-pseudo-unpredictable.s  16
-rw-r--r--  test/MC/COFF/bad-expr.s  3
-rw-r--r--  test/MC/COFF/cv-def-range-gap.s  16
-rw-r--r--  test/MC/COFF/cv-def-range.s  10
-rw-r--r--  test/MC/COFF/cv-inline-linetable-infloop.s  2
-rw-r--r--  test/MC/COFF/cv-inline-linetable-unlikely.s  4
-rw-r--r--  test/MC/COFF/cv-inline-linetable-unreachable.s  2
-rw-r--r--  test/MC/COFF/cv-inline-linetable.s  4
-rw-r--r--  test/MC/Disassembler/Mips/mt/valid-r2-el.txt  32
-rw-r--r--  test/MC/Disassembler/Mips/mt/valid-r2.txt  32
-rw-r--r--  test/MC/ELF/bad-expr3.s  3
-rw-r--r--  test/MC/Mips/addend.s  21
-rw-r--r--  test/MC/Mips/mt/abiflag.s  10
-rw-r--r--  test/MC/Mips/mt/invalid-wrong-error.s  3
-rw-r--r--  test/MC/Mips/mt/invalid.s  27
-rw-r--r--  test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s  18
-rw-r--r--  test/MC/Mips/mt/mftr-mttr-aliases-invalid.s  23
-rw-r--r--  test/MC/Mips/mt/mftr-mttr-aliases.s  47
-rw-r--r--  test/MC/Mips/mt/mftr-mttr-reserved-valid.s  8
-rw-r--r--  test/MC/Mips/mt/module-directive-invalid.s  6
-rw-r--r--  test/MC/Mips/mt/module-directive.s  16
-rw-r--r--  test/MC/Mips/mt/set-directive.s  14
-rw-r--r--  test/MC/Mips/mt/valid.s  33
-rw-r--r--  test/MC/WebAssembly/array-fill.ll  14
-rw-r--r--  test/MC/WebAssembly/external-data.ll  3
-rw-r--r--  test/MC/WebAssembly/external-func-address.ll  49
-rw-r--r--  test/MC/WebAssembly/unnamed-data.ll  3
-rw-r--r--  test/MC/WebAssembly/weak-alias.ll  37
-rw-r--r--  test/Object/Inputs/trivial-object-test.wasm  bin 0 -> 303 bytes
-rw-r--r--  test/Object/Inputs/trivial.ll  3
-rw-r--r--  test/Object/nm-trivial-object.test  7
-rw-r--r--  test/Object/obj2yaml.test  25
-rw-r--r--  test/Object/objdump-relocations.test  7
-rw-r--r--  test/ObjectYAML/wasm/data_section.yaml  5
-rw-r--r--  test/Other/2002-01-31-CallGraph.ll  2
-rw-r--r--  test/Other/new-pm-defaults.ll  40
-rw-r--r--  test/Other/new-pm-lto-defaults.ll  11
-rw-r--r--  test/Other/pass-pipelines.ll  2
-rw-r--r--  test/SafepointIRVerifier/basic-use-after-reloc.ll  23
-rw-r--r--  test/SafepointIRVerifier/compares.ll  85
-rw-r--r--  test/SafepointIRVerifier/constant-bases.ll  70
-rw-r--r--  test/SafepointIRVerifier/unrecorded-live-at-sp.ll  71
-rw-r--r--  test/SafepointIRVerifier/uses-in-phi-nodes.ll  78
-rw-r--r--  test/TableGen/AsmVariant.td  1
-rw-r--r--  test/TableGen/GlobalISelEmitter.td  931
-rw-r--r--  test/TableGen/UnterminatedComment.td  2
-rw-r--r--  test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll  38
-rw-r--r--  test/Transforms/CodeGenPrepare/X86/memcmp.ll  77
-rw-r--r--  test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll  24
-rw-r--r--  test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll  16
-rw-r--r--  test/Transforms/ConstantHoisting/ARM/bad-cases.ll  31
-rw-r--r--  test/Transforms/ConstantHoisting/ARM/insertvalue.ll  31
-rw-r--r--  test/Transforms/ConstantHoisting/X86/ehpad.ll  5
-rw-r--r--  test/Transforms/GVN/PRE/atomic.ll  6
-rw-r--r--  test/Transforms/GVN/PRE/phi-translate-2.ll  131
-rw-r--r--  test/Transforms/GVN/PRE/pre-gep-load.ll  2
-rw-r--r--  test/Transforms/GVN/PRE/pre-load.ll  6
-rw-r--r--  test/Transforms/IndVarSimplify/canonicalize-cmp.ll  98
-rw-r--r--  test/Transforms/IndVarSimplify/eliminate-comparison.ll  4
-rw-r--r--  test/Transforms/IndVarSimplify/strengthen-overflow.ll  84
-rw-r--r--  test/Transforms/IndVarSimplify/widen-loop-comp.ll  2
-rw-r--r--  test/Transforms/InferAddressSpaces/AMDGPU/basic.ll  12
-rw-r--r--  test/Transforms/Inline/ARM/inline-target-attr.ll  60
-rw-r--r--  test/Transforms/Inline/ARM/lit.local.cfg  2
-rw-r--r--  test/Transforms/Inline/cgscc-incremental-invalidate.ll  105
-rw-r--r--  test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll  29
-rw-r--r--  test/Transforms/InstCombine/and-or-not.ll  24
-rw-r--r--  test/Transforms/InstCombine/bswap-fold.ll  161
-rw-r--r--  test/Transforms/InstCombine/cmp-intrinsic.ll  123
-rw-r--r--  test/Transforms/InstCombine/consecutive-fences.ll  12
-rw-r--r--  test/Transforms/InstCombine/icmp.ll  16
-rw-r--r--  test/Transforms/InstCombine/intrinsics.ll  60
-rw-r--r--  test/Transforms/InstCombine/or-xor.ll  24
-rw-r--r--  test/Transforms/InstCombine/pr33689_same_bitwidth.ll  53
-rw-r--r--  test/Transforms/InstCombine/select-implied.ll  77
-rw-r--r--  test/Transforms/InstCombine/select.ll  7
-rw-r--r--  test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll  29
-rw-r--r--  test/Transforms/LoopRotate/pr33701.ll  27
-rw-r--r--  test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll  4
-rw-r--r--  test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll  60
-rw-r--r--  test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll  353
-rw-r--r--  test/Transforms/LoopUnroll/runtime-loop.ll  68
-rw-r--r--  test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll  49
-rw-r--r--  test/Transforms/LoopVectorize/if-conversion-nest.ll  95
-rw-r--r--  test/Transforms/LoopVectorize/pr33706.ll  61
-rw-r--r--  test/Transforms/LowerTypeTests/Inputs/import-icall.yaml  1
-rw-r--r--  test/Transforms/LowerTypeTests/import-icall.ll  7
-rw-r--r--  test/Transforms/NewGVN/pr33720.ll  91
-rw-r--r--  test/Transforms/PGOProfile/counter_promo_exit_merge.ll  4
-rw-r--r--  test/Transforms/PGOProfile/counter_promo_mexits.ll  4
-rw-r--r--  test/Transforms/PGOProfile/counter_promo_nest.ll  165
-rw-r--r--  test/Transforms/SimplifyCFG/implied-and-or.ll  183
-rw-r--r--  test/Transforms/SimplifyCFG/sink-common-code.ll  24
-rw-r--r--  test/Transforms/Sink/fence.ll  8
-rw-r--r--  test/Transforms/ThinLTOBitcodeWriter/pr33536.ll  37
-rw-r--r--  test/Unit/lit.cfg  5
-rw-r--r--  test/Verifier/2004-05-21-SwitchConstantMismatch.ll  2
-rw-r--r--  test/Verifier/2007-12-21-InvokeParamAttrs.ll  2
-rw-r--r--  test/Verifier/2008-01-11-VarargAttrs.ll  2
-rw-r--r--  test/Verifier/2009-05-29-InvokeResult1.ll  2
-rw-r--r--  test/Verifier/2009-05-29-InvokeResult2.ll  2
-rw-r--r--  test/Verifier/2009-05-29-InvokeResult3.ll  2
-rw-r--r--  test/Verifier/byval-1.ll  2
-rw-r--r--  test/Verifier/element-wise-atomic-memory-intrinsics.ll  42
-rw-r--r--  test/Verifier/gcread-ptrptr.ll  2
-rw-r--r--  test/Verifier/gcroot-alloca.ll  2
-rw-r--r--  test/Verifier/gcroot-meta.ll  2
-rw-r--r--  test/Verifier/gcroot-ptrptr.ll  2
-rw-r--r--  test/Verifier/gcwrite-ptrptr.ll  2
-rw-r--r--  test/lit.cfg  5
-rw-r--r--  test/tools/llvm-cov/threads.c  11
-rw-r--r--  test/tools/llvm-cov/zeroFunctionFile.c  2
-rw-r--r--  test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm  bin 0 -> 360 bytes
-rw-r--r--  test/tools/llvm-objdump/ARM/macho-reloc-half.test  4
-rw-r--r--  test/tools/llvm-objdump/Inputs/test.wasm  bin 181 -> 0 bytes
-rw-r--r--  test/tools/llvm-objdump/Inputs/trivial.ll  19
-rw-r--r--  test/tools/llvm-objdump/Inputs/trivial.obj.wasm  bin 0 -> 303 bytes
-rw-r--r--  test/tools/llvm-objdump/WebAssembly/symbol-table.test  17
-rw-r--r--  test/tools/llvm-objdump/wasm.txt  35
-rw-r--r--  test/tools/llvm-pdbdump/partial-type-stream.test  3
-rw-r--r--  test/tools/llvm-profdata/c-general.test  4
-rw-r--r--  test/tools/llvm-readobj/Inputs/trivial.ll  14
-rw-r--r--  test/tools/llvm-readobj/Inputs/trivial.obj.wasm  bin 221 -> 285 bytes
-rw-r--r--  test/tools/llvm-readobj/codeview-linetables.test  20
-rw-r--r--  test/tools/llvm-readobj/file-headers.test  3
-rw-r--r--  test/tools/llvm-readobj/relocations.test  15
-rw-r--r--  test/tools/llvm-readobj/sections.test  131
-rw-r--r--  test/tools/llvm-readobj/symbols.test  22
596 files changed, 80721 insertions, 12726 deletions
diff --git a/test/Analysis/BasicAA/unreachable-block.ll b/test/Analysis/BasicAA/unreachable-block.ll
index 551d18e3e0fb3..d6c149f816618 100644
--- a/test/Analysis/BasicAA/unreachable-block.ll
+++ b/test/Analysis/BasicAA/unreachable-block.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -aa-eval -disable-output < %s >& /dev/null
+; RUN: opt -basicaa -aa-eval -disable-output < %s > /dev/null 2>&1
; BasicAA shouldn't infinitely recurse on the use-def cycles in
; unreachable code.
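
Note: the RUN-line change above swaps ">&", a csh/bash-only redirection, for the POSIX form "> /dev/null 2>&1". Both discard stdout and stderr, but only the latter is accepted by a plain /bin/sh and by lit's internal shell (the portability motivation is inferred from the change itself; the commit does not state it). A minimal sketch of the two spellings as lit directives in a .ll test:

; Portable: redirect stdout to /dev/null, then point stderr at stdout.
; RUN: opt -basicaa -aa-eval -disable-output < %s > /dev/null 2>&1
; Non-portable csh/bash shorthand for the same effect (the removed form):
; RUN: opt -basicaa -aa-eval -disable-output < %s >& /dev/null
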
diff --git a/test/Analysis/CostModel/X86/slm-arith-costs.ll b/test/Analysis/CostModel/X86/slm-arith-costs.ll
index 3673a5d9e0673..a767aa30b8ed1 100644
--- a/test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ b/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -3,6 +3,20 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} add <2 x i64>
+ %res = add <2 x i64> %a, %b
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} sub <2 x i64>
+ %res = sub <2 x i64> %a, %b
+ ret <2 x i64> %res
+}
+
; 8bit mul
define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
entry:
@@ -13,7 +27,7 @@ entry:
define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i8>
%res = mul nsw <2 x i8> %a, %b
ret <2 x i8> %res
}
@@ -97,7 +111,7 @@ entry:
define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i16>
%res = mul nsw <2 x i16> %a, %b
ret <2 x i16> %res
}
@@ -181,7 +195,7 @@ entry:
define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
%res = mul nsw <2 x i32> %a, %b
ret <2 x i32> %res
}
@@ -217,28 +231,28 @@ entry:
define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i64>
%res = mul nsw <2 x i64> %a, %b
ret <2 x i64> %res
}
define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
entry:
-; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM: cost of 34 {{.*}} mul nsw <4 x i64>
%res = mul nsw <4 x i64> %a, %b
ret <4 x i64> %res
}
define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
entry:
-; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM: cost of 68 {{.*}} mul nsw <8 x i64>
%res = mul nsw <8 x i64> %a, %b
ret <8 x i64> %res
}
define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
entry:
-; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM: cost of 136 {{.*}} mul nsw <16 x i64>
%res = mul nsw <16 x i64> %a, %b
ret <16 x i64> %res
}
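
Note: these updates raise the expected Silvermont (SLM) cost of the <2 x iN> multiplies from 11 to 17 and rescale the wider 64-bit multiplies to match (22/44/88 become 34/68/136), and they add a baseline cost of 4 for <2 x i64> add and sub. A minimal cost-model test in the same style; the RUN line is an assumption modeled on similar X86 cost-model tests, since the file's actual RUN line sits above the hunks shown:

; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s
; (assumed RUN line; the real file uses an SLM check prefix)
target triple = "x86_64-unknown-linux-gnu"

define <2 x i64> @sample_mul(<2 x i64> %a, <2 x i64> %b) {
entry:
; CHECK: cost of 17 {{.*}} mul nsw <2 x i64>
  %res = mul nsw <2 x i64> %a, %b
  ret <2 x i64> %res
}
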
diff --git a/test/Analysis/DependenceAnalysis/BasePtrBug.ll b/test/Analysis/DependenceAnalysis/BasePtrBug.ll
new file mode 100644
index 0000000000000..8de75df7dbdd7
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/BasePtrBug.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; Test that the dependence analysis generates the correct results when using
+; an aliased object that points to a different element in the same array.
+; PR33567 - https://bugs.llvm.org/show_bug.cgi?id=33567
+
+; void test1(int *A, int *B, int N) {
+; int *top = A;
+; int *bot = A + N/2;
+; for (int i = 0; i < N; i++)
+; B[i] = top[i] + bot[i];
+; }
+
+; CHECK-LABEL: test1
+; CHECK: da analyze - input [*|<]!
+
+define void @test1(i32* nocapture %A, i32* nocapture %B, i32 %N) #0 {
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %div = sdiv i32 %N, 2
+ %bot.gep = getelementptr i32, i32* %A, i32 %div
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %gep.0 = getelementptr i32, i32* %A, i32 %i
+ %gep.1 = getelementptr i32, i32* %bot.gep, i32 %i
+ %gep.B = getelementptr i32, i32* %B, i32 %i
+ %0 = load i32, i32* %gep.0, align 4
+ %1 = load i32, i32* %gep.1, align 4
+ %add = add nsw i32 %1, %0
+ store i32 %add, i32* %gep.B, align 4
+ %inc = add nsw i32 %i, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+; void test2(int *A, unsigned n) {
+; int *B = A + 1;
+; for (unsigned i = 0; i < n; ++i) {
+; A[i] = B[i];
+; }
+; }
+
+; CHECK-LABEL: test2
+; CHECK: da analyze - consistent anti [1]!
+
+define void @test2(i32*, i32) #3 {
+ %3 = getelementptr inbounds i32, i32* %0, i64 1
+ br label %4
+
+; <label>:4:
+ %.0 = phi i32 [ 0, %2 ], [ %14, %13 ]
+ %5 = sub i32 %1, 1
+ %6 = icmp ult i32 %.0, %5
+ br i1 %6, label %7, label %15
+
+; <label>:7:
+ %8 = zext i32 %.0 to i64
+ %9 = getelementptr inbounds i32, i32* %3, i64 %8
+ %10 = load i32, i32* %9, align 4
+ %11 = zext i32 %.0 to i64
+ %12 = getelementptr inbounds i32, i32* %0, i64 %11
+ store i32 %10, i32* %12, align 4
+ br label %13
+
+; <label>:13:
+ %14 = add i32 %.0, 1
+ br label %4
+
+; <label>:15:
+ ret void
+}
diff --git a/test/Analysis/ScalarEvolution/guards.ll b/test/Analysis/ScalarEvolution/guards.ll
index 52ad4dc73d417..d4b1f431ffc6a 100644
--- a/test/Analysis/ScalarEvolution/guards.ll
+++ b/test/Analysis/ScalarEvolution/guards.ll
@@ -19,7 +19,7 @@ entry:
loop:
; CHECK: loop:
; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 true) [ "deopt"() ]
-; CHECK: %iv.inc.cmp = icmp slt i32 %iv.inc, %len
+; CHECK: %iv.inc.cmp = icmp ult i32 %iv.inc, %len
; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %iv.inc.cmp) [ "deopt"() ]
; CHECK: leave:
@@ -41,7 +41,7 @@ leave:
define void @test_2(i32 %n, i32* %len_buf) {
; CHECK-LABEL: @test_2(
-; CHECK: [[LEN_SEXT:%[^ ]+]] = sext i32 %len to i64
+; CHECK: [[LEN_ZEXT:%[^ ]+]] = zext i32 %len to i64
; CHECK: br label %loop
entry:
@@ -52,7 +52,7 @@ loop:
; CHECK: loop:
; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK: %iv.inc.cmp = icmp slt i64 %indvars.iv.next, [[LEN_SEXT]]
+; CHECK: %iv.inc.cmp = icmp ult i64 %indvars.iv.next, [[LEN_ZEXT]]
; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %iv.inc.cmp) [ "deopt"() ]
; CHECK: leave:
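
Note: the updated checks replace sext with zext and signed compares with unsigned ones, consistent with ScalarEvolution now proving from the guard conditions that the guarded values are non-negative; for two's-complement integers whose sign bits are clear, signed and unsigned comparison (and sign- and zero-extension) coincide. That attribution is an inference from the checks, not stated in the commit. A small illustration of the equivalence, as a hypothetical function:

; When %x and %y are both known non-negative, the two compares below
; always produce the same result, so the unsigned form may be used.
define i1 @signed_unsigned_agree(i32 %x, i32 %y) {
  %c.s = icmp slt i32 %x, %y    ; signed less-than
  %c.u = icmp ult i32 %x, %y    ; unsigned less-than
  %same = icmp eq i1 %c.s, %c.u ; always true when %x, %y >= 0
  ret i1 %same
}
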
diff --git a/test/Assembler/2003-11-11-ImplicitRename.ll b/test/Assembler/2003-11-11-ImplicitRename.ll
index 7bfd3c14bf194..84065a17846db 100644
--- a/test/Assembler/2003-11-11-ImplicitRename.ll
+++ b/test/Assembler/2003-11-11-ImplicitRename.ll
@@ -1,8 +1,7 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
void %test() {
%X = add int 0, 1
%X = add int 1, 2
ret void
}
-
diff --git a/test/Assembler/2007-11-26-AttributeOverload.ll b/test/Assembler/2007-11-26-AttributeOverload.ll
index aebc2e8d01e5f..ab5d514a38b6e 100644
--- a/test/Assembler/2007-11-26-AttributeOverload.ll
+++ b/test/Assembler/2007-11-26-AttributeOverload.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare i32 @atoi(i8*) nounwind readonly
declare i32 @atoi(i8*)
diff --git a/test/Assembler/atomic.ll b/test/Assembler/atomic.ll
index 148b95d88e307..a8b527f2f863c 100644
--- a/test/Assembler/atomic.ll
+++ b/test/Assembler/atomic.ll
@@ -5,14 +5,20 @@
define void @f(i32* %x) {
; CHECK: load atomic i32, i32* %x unordered, align 4
load atomic i32, i32* %x unordered, align 4
- ; CHECK: load atomic volatile i32, i32* %x singlethread acquire, align 4
- load atomic volatile i32, i32* %x singlethread acquire, align 4
+ ; CHECK: load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+ load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+ ; CHECK: load atomic volatile i32, i32* %x syncscope("agent") acquire, align 4
+ load atomic volatile i32, i32* %x syncscope("agent") acquire, align 4
; CHECK: store atomic i32 3, i32* %x release, align 4
store atomic i32 3, i32* %x release, align 4
- ; CHECK: store atomic volatile i32 3, i32* %x singlethread monotonic, align 4
- store atomic volatile i32 3, i32* %x singlethread monotonic, align 4
- ; CHECK: cmpxchg i32* %x, i32 1, i32 0 singlethread monotonic monotonic
- cmpxchg i32* %x, i32 1, i32 0 singlethread monotonic monotonic
+ ; CHECK: store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+ store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 3, i32* %x syncscope("workgroup") monotonic, align 4
+ store atomic volatile i32 3, i32* %x syncscope("workgroup") monotonic, align 4
+ ; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+ cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+ ; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("workitem") monotonic monotonic
+ cmpxchg i32* %x, i32 1, i32 0 syncscope("workitem") monotonic monotonic
; CHECK: cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
@@ -23,9 +29,13 @@ define void @f(i32* %x) {
atomicrmw add i32* %x, i32 10 seq_cst
; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic
atomicrmw volatile xchg i32* %x, i32 10 monotonic
- ; CHECK: fence singlethread release
- fence singlethread release
+ ; CHECK: atomicrmw volatile xchg i32* %x, i32 10 syncscope("agent") monotonic
+ atomicrmw volatile xchg i32* %x, i32 10 syncscope("agent") monotonic
+ ; CHECK: fence syncscope("singlethread") release
+ fence syncscope("singlethread") release
; CHECK: fence seq_cst
fence seq_cst
+ ; CHECK: fence syncscope("device") seq_cst
+ fence syncscope("device") seq_cst
ret void
}
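
Note: this test, like the Bitcode compatibility updates below, tracks the new textual IR syntax for synchronization scopes: the bare "singlethread" keyword becomes syncscope("singlethread"), and arbitrary target-defined scope names such as "agent", "workgroup", and "workitem" become expressible. A sketch of the new syntax across the affected instruction kinds (scope names taken from the diff; their semantics are target-specific):

define void @scope_syntax(i32* %p) {
  ; was: fence singlethread release
  fence syncscope("singlethread") release
  store atomic i32 1, i32* %p syncscope("agent") monotonic, align 4
  %old = atomicrmw add i32* %p, i32 1 syncscope("workgroup") seq_cst
  %pair = cmpxchg i32* %p, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
  ret void
}
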
diff --git a/test/Bitcode/Inputs/module-hash-strtab1.ll b/test/Bitcode/Inputs/module-hash-strtab1.ll
new file mode 100644
index 0000000000000..6b4a3fce07eff
--- /dev/null
+++ b/test/Bitcode/Inputs/module-hash-strtab1.ll
@@ -0,0 +1,10 @@
+source_filename = "foo.c"
+
+$com = comdat any
+
+define void @main() comdat($com) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
diff --git a/test/Bitcode/Inputs/module-hash-strtab2.ll b/test/Bitcode/Inputs/module-hash-strtab2.ll
new file mode 100644
index 0000000000000..87d2478145bc3
--- /dev/null
+++ b/test/Bitcode/Inputs/module-hash-strtab2.ll
@@ -0,0 +1,10 @@
+source_filename = "foo.c"
+
+$dat = comdat any
+
+define void @main() comdat($dat) {
+ call void @foo()
+ ret void
+}
+
+declare void @foo()
diff --git a/test/Bitcode/atomic-no-syncscope.ll b/test/Bitcode/atomic-no-syncscope.ll
new file mode 100644
index 0000000000000..a57507bc81468
--- /dev/null
+++ b/test/Bitcode/atomic-no-syncscope.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+; Backwards compatibility test: make sure we can process bitcode without
+; synchronization scope names encoded in it.
+
+; CHECK: load atomic i32, i32* %x unordered, align 4
+; CHECK: load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+; CHECK: store atomic i32 3, i32* %x release, align 4
+; CHECK: store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+; CHECK: cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
+; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
+; CHECK: cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic
+; CHECK: atomicrmw add i32* %x, i32 10 seq_cst
+; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic
+; CHECK: fence syncscope("singlethread") release
+; CHECK: fence seq_cst
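
Note: the companion .ll.bc file (next entry) was produced before scope names existed in bitcode, when the scope was a single cross-thread/single-thread flag; the CHECK lines above pin down how current llvm-dis upgrades that encoding. The implied mapping, inferred from the checks rather than from the bitcode reader itself:

; old bitcode scope flag         upgraded textual IR
; cross-thread (the default) --> no syncscope qualifier
; single-thread              --> syncscope("singlethread")
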
diff --git a/test/Bitcode/atomic-no-syncscope.ll.bc b/test/Bitcode/atomic-no-syncscope.ll.bc
new file mode 100644
index 0000000000000..01d565eb4426e
--- /dev/null
+++ b/test/Bitcode/atomic-no-syncscope.ll.bc
Binary files differ
diff --git a/test/Bitcode/atomic.ll b/test/Bitcode/atomic.ll
index c09e74c1c2f24..bef3f2712935a 100644
--- a/test/Bitcode/atomic.ll
+++ b/test/Bitcode/atomic.ll
@@ -11,8 +11,8 @@ define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
; CHECK: cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
- cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
- ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
+ cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new syncscope("singlethread") release monotonic
+ ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new syncscope("singlethread") release monotonic
ret void
}
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index 8d51ee11a209b..cf6c30e7c26c1 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -551,8 +551,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -571,33 +571,33 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
; XXX: The parser spits out the load type here.
%ld.1 = load atomic i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index ebdf4c30587c9..180dad258b682 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -596,8 +596,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -616,32 +616,32 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
%ld.1 = load atomic i32, i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32, i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 57ea3e068376f..370c7f51a2b7d 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -627,8 +627,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -647,32 +647,32 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
%ld.1 = load atomic i32, i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32, i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 2a6cfe14cdb14..4115cbd8fe64d 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -698,8 +698,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -718,32 +718,32 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
%ld.1 = load atomic i32, i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32, i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
index c83c107a2927a..eef925564ecbf 100644
--- a/test/Bitcode/compatibility-4.0.ll
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -698,8 +698,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -718,32 +718,32 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
%ld.1 = load atomic i32, i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32, i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index ec69344947c5e..ebd727ba9aeee 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -705,8 +705,8 @@ define void @atomics(i32* %word) {
; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
%cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
- %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
- ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+ %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+ ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
%atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
%atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -725,32 +725,32 @@ define void @atomics(i32* %word) {
; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
%atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
- %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
- %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
- ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+ %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+ %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+ ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
fence acquire
; CHECK: fence acquire
fence release
; CHECK: fence release
fence acq_rel
; CHECK: fence acq_rel
- fence singlethread seq_cst
- ; CHECK: fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
+ ; CHECK: fence syncscope("singlethread") seq_cst
%ld.1 = load atomic i32, i32* %word monotonic, align 4
; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
%ld.2 = load atomic volatile i32, i32* %word acquire, align 8
; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
- %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
- ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+ %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+ ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
store atomic i32 23, i32* %word monotonic, align 4
; CHECK: store atomic i32 23, i32* %word monotonic, align 4
store atomic volatile i32 24, i32* %word monotonic, align 4
; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
- store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
- ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+ store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+ ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
ret void
}
diff --git a/test/Bitcode/memInstructions.3.2.ll b/test/Bitcode/memInstructions.3.2.ll
index 1ab05b6d1b422..c530b6d2cb174 100644
--- a/test/Bitcode/memInstructions.3.2.ll
+++ b/test/Bitcode/memInstructions.3.2.ll
@@ -107,29 +107,29 @@ entry:
; CHECK-NEXT: %res8 = load atomic volatile i8, i8* %ptr1 seq_cst, align 1
%res8 = load atomic volatile i8, i8* %ptr1 seq_cst, align 1
-; CHECK-NEXT: %res9 = load atomic i8, i8* %ptr1 singlethread unordered, align 1
- %res9 = load atomic i8, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: %res9 = load atomic i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
+ %res9 = load atomic i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
-; CHECK-NEXT: %res10 = load atomic i8, i8* %ptr1 singlethread monotonic, align 1
- %res10 = load atomic i8, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: %res10 = load atomic i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+ %res10 = load atomic i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
-; CHECK-NEXT: %res11 = load atomic i8, i8* %ptr1 singlethread acquire, align 1
- %res11 = load atomic i8, i8* %ptr1 singlethread acquire, align 1
+; CHECK-NEXT: %res11 = load atomic i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
+ %res11 = load atomic i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
-; CHECK-NEXT: %res12 = load atomic i8, i8* %ptr1 singlethread seq_cst, align 1
- %res12 = load atomic i8, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: %res12 = load atomic i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+ %res12 = load atomic i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
-; CHECK-NEXT: %res13 = load atomic volatile i8, i8* %ptr1 singlethread unordered, align 1
- %res13 = load atomic volatile i8, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: %res13 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
+ %res13 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
-; CHECK-NEXT: %res14 = load atomic volatile i8, i8* %ptr1 singlethread monotonic, align 1
- %res14 = load atomic volatile i8, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: %res14 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+ %res14 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
-; CHECK-NEXT: %res15 = load atomic volatile i8, i8* %ptr1 singlethread acquire, align 1
- %res15 = load atomic volatile i8, i8* %ptr1 singlethread acquire, align 1
+; CHECK-NEXT: %res15 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
+ %res15 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
-; CHECK-NEXT: %res16 = load atomic volatile i8, i8* %ptr1 singlethread seq_cst, align 1
- %res16 = load atomic volatile i8, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: %res16 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+ %res16 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
ret void
}
@@ -193,29 +193,29 @@ entry:
; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
- store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
+ store atomic i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
- store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+ store atomic i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread release, align 1
- store atomic i8 2, i8* %ptr1 singlethread release, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
+ store atomic i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
- store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+ store atomic i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
- store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
+ store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
- store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+ store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
- store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
+ store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
- store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+ store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
ret void
}
@@ -232,13 +232,13 @@ entry:
; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0
%res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0
- %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+ %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0
- %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+ %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
@@ -249,13 +249,13 @@ entry:
; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0
%res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0
- %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+ %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0
- %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+ %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
@@ -266,13 +266,13 @@ entry:
; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0
%res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0
- %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+ %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0
- %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+ %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
@@ -283,13 +283,13 @@ entry:
; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0
%res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0
- %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+ %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0
- %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+ %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
@@ -300,13 +300,13 @@ entry:
; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0
%res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0
- %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+ %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0
- %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+ %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
ret void
}
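A note on the shape these cmpxchg checks match: the instruction returns a { value, success } pair, so each CHECK captures the raw result and the test extracts field 0 before using it. Self-contained sketch (not part of the patch):

define i32 @cas_sketch(i32* %p, i32 %cmp, i32 %new) {
  %pair = cmpxchg i32* %p, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
  %old = extractvalue { i32, i1 } %pair, 0   ; value read from memory
  ret i32 %old                               ; index 1 holds the i1 success bit
}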
diff --git a/test/Bitcode/module-hash-strtab.ll b/test/Bitcode/module-hash-strtab.ll
new file mode 100644
index 0000000000000..e5a1fb0c40779
--- /dev/null
+++ b/test/Bitcode/module-hash-strtab.ll
@@ -0,0 +1,15 @@
+; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump | grep '<HASH' > %t
+; RUN: opt -module-hash %S/Inputs/module-hash-strtab1.ll -o - | llvm-bcanalyzer -dump | grep '<HASH' >> %t
+; RUN: opt -module-hash %S/Inputs/module-hash-strtab2.ll -o - | llvm-bcanalyzer -dump | grep '<HASH' >> %t
+; RUN: sort %t | uniq | count 3
+
+source_filename = "foo.c"
+
+$com = comdat any
+
+define void @main() comdat($com) {
+ call void @foo()
+ ret void
+}
+
+declare void @foo()
diff --git a/test/Bitcode/module_hash.ll b/test/Bitcode/module_hash.ll
index 56f3fdc4b7eaa..b24819fe6fdec 100644
--- a/test/Bitcode/module_hash.ll
+++ b/test/Bitcode/module_hash.ll
@@ -1,7 +1,7 @@
; Check per module hash.
-; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump | FileCheck %s --check-prefix=MOD1
+; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump -check-hash=foo | FileCheck %s --check-prefix=MOD1
; MOD1: <HASH op0={{[0-9]*}} op1={{[0-9]*}} op2={{[0-9]*}} op3={{[0-9]*}} op4={{[0-9]*}} (match)/>
-; RUN: opt -module-hash %p/Inputs/module_hash.ll -o - | llvm-bcanalyzer -dump | FileCheck %s --check-prefix=MOD2
+; RUN: opt -module-hash %p/Inputs/module_hash.ll -o - | llvm-bcanalyzer -dump -check-hash=bar | FileCheck %s --check-prefix=MOD2
; MOD2: <HASH op0={{[0-9]*}} op1={{[0-9]*}} op2={{[0-9]*}} op3={{[0-9]*}} op4={{[0-9]*}} (match)/>
; Check that the hash matches in the combined index.
@@ -21,8 +21,8 @@
; RUN: cat %t.hash | FileCheck %s --check-prefix=COMBINED
; First capture the value of the hash for the two modules.
-; COMBINED: <HASH op0=[[HASH1_1:[0-9]*]] op1=[[HASH1_2:[0-9]*]] op2=[[HASH1_3:[0-9]*]] op3=[[HASH1_4:[0-9]*]] op4=[[HASH1_5:[0-9]*]] (match)/>
-; COMBINED: <HASH op0=[[HASH2_1:[0-9]*]] op1=[[HASH2_2:[0-9]*]] op2=[[HASH2_3:[0-9]*]] op3=[[HASH2_4:[0-9]*]] op4=[[HASH2_5:[0-9]*]] (match)/>
+; COMBINED: <HASH op0=[[HASH1_1:[0-9]*]] op1=[[HASH1_2:[0-9]*]] op2=[[HASH1_3:[0-9]*]] op3=[[HASH1_4:[0-9]*]] op4=[[HASH1_5:[0-9]*]]/>
+; COMBINED: <HASH op0=[[HASH2_1:[0-9]*]] op1=[[HASH2_2:[0-9]*]] op2=[[HASH2_3:[0-9]*]] op3=[[HASH2_4:[0-9]*]] op4=[[HASH2_5:[0-9]*]]/>
; Validate against the value extracted from the combined index
; COMBINED-DAG: <HASH abbrevid={{[0-9]*}} op0=[[HASH1_1]] op1=[[HASH1_2]] op2=[[HASH1_3]] op3=[[HASH1_4]] op4=[[HASH1_5]]/>
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index a4d259add6093..86766f194688c 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -29,7 +29,7 @@
; CHECK-NEXT: <VERSION
; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=1 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=1 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=4/>
; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
; CHECK: <STRTAB_BLOCK
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index b62090efe20b9..09a6bbcb51d5c 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -29,7 +29,7 @@
; CHECK-NEXT: <VERSION
; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=4/>
; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
; CHECK: <STRTAB_BLOCK
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 50ad83feed859..10ce87c2a1873 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1328,16 +1328,16 @@ define void @test_load_store_atomics(i8* %addr) {
; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr)
; CHECK: [[V1:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr)
; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr)
-; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load singlethread seq_cst 1 from %ir.addr)
-; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store singlethread monotonic 1 into %ir.addr)
+; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load syncscope("singlethread") seq_cst 1 from %ir.addr)
+; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store syncscope("singlethread") monotonic 1 into %ir.addr)
%v0 = load atomic i8, i8* %addr unordered, align 1
store atomic i8 %v0, i8* %addr monotonic, align 1
%v1 = load atomic i8, i8* %addr acquire, align 1
store atomic i8 %v1, i8* %addr release, align 1
- %v2 = load atomic i8, i8* %addr singlethread seq_cst, align 1
- store atomic i8 %v2, i8* %addr singlethread monotonic, align 1
+ %v2 = load atomic i8, i8* %addr syncscope("singlethread") seq_cst, align 1
+ store atomic i8 %v2, i8* %addr syncscope("singlethread") monotonic, align 1
ret void
}
diff --git a/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
new file mode 100644
index 0000000000000..8604b2769ba30
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
@@ -0,0 +1,30 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+ define void @implicit_def() { ret void }
+...
+
+---
+# CHECK-LABEL: name: implicit_def
+name: implicit_def
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+
+# CHECK: body:
+# CHECK: [[DEF:%[0-9]+]] = IMPLICIT_DEF
+# CHECK: [[ADD:%[0-9]+]] = ADDWrr [[DEF]], [[DEF]]
+# CHECK: %w0 = COPY [[ADD]]
+body: |
+ bb.0:
+ %0(s32) = G_IMPLICIT_DEF
+ %1(s32) = G_ADD %0, %0
+ %w0 = COPY %1(s32)
+...
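This new test feeds MIR straight into the instruction selector; the G_IMPLICIT_DEF it starts from is roughly what the IRTranslator emits for an undef operand. A hedged IR-level equivalent (function name is illustrative):

define i32 @implicit_def_sketch() {
  %sum = add i32 undef, undef   ; GlobalISel models undef as G_IMPLICIT_DEF
  ret i32 %sum
}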
diff --git a/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
new file mode 100644
index 0000000000000..43e682c6b6ca5
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
@@ -0,0 +1,38 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+ define void @sdiv_s32_gpr() { ret void }
+...
+
+---
+# Check that we select a 32-bit GPR sdiv intrinsic into SDIVWr for GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: sdiv_s32_gpr
+name: sdiv_s32_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: gpr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %w0
+# CHECK: %1 = COPY %w1
+# CHECK: %2 = SDIVWr %0, %1
+body: |
+ bb.0:
+ liveins: %w0, %w1
+
+ %0(s32) = COPY %w0
+ %1(s32) = COPY %w1
+ %2(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.sdiv.i32), %0, %1
+ %w0 = COPY %2(s32)
+...
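For context, the G_INTRINSIC above is the GISel form of the target intrinsic; the same selection is exercised at the IR level by something like this sketch (function name is illustrative):

declare i32 @llvm.aarch64.sdiv.i32(i32, i32)

define i32 @sdiv_sketch(i32 %a, i32 %b) {
  %q = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b)   ; selects to SDIVWr
  ret i32 %q
}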
diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
index cfb8e3a38c492..37cc5411aa31b 100644
--- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll
+++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
@@ -13,9 +13,9 @@
; CHECK: SU(2): STRWui %WZR
; CHECK: SU(3): %X21<def>, %X20<def> = LDPXi %SP
; CHECK: Predecessors:
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: ord SU(0)
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Ord
; CHECK-NEXT: Successors:
define void @test1() {
entry:
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index cde62fcb3f95c..ad4feef7280f2 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -8,8 +8,8 @@
; CHECK: shiftable
; CHECK: SU(2): %vreg2<def> = SUBXri %vreg1, 20, 0
; CHECK: Successors:
-; CHECK-NEXT: data SU(4): Latency=1 Reg=%vreg2
-; CHECK-NEXT: data SU(3): Latency=2 Reg=%vreg2
+; CHECK-NEXT: SU(4): Data Latency=1 Reg=%vreg2
+; CHECK-NEXT: SU(3): Data Latency=2 Reg=%vreg2
; CHECK: ********** INTERVALS **********
define i64 @shiftable(i64 %A, i64 %B) {
%tmp0 = sub i64 %B, 20
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
index 748a4762d82f4..9cbf0cb3803a8 100644
--- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -7,11 +7,11 @@
; CHECK: misched_bug:BB#0 entry
; CHECK: SU(2): %vreg2<def> = LDRWui %vreg0, 1; mem:LD4[%ptr1_plus1] GPR32:%vreg2 GPR64common:%vreg0
; CHECK: Successors:
-; CHECK-NEXT: data SU(5): Latency=4 Reg=%vreg2
-; CHECK-NEXT: ord SU(4): Latency=0
+; CHECK-NEXT: SU(5): Data Latency=4 Reg=%vreg2
+; CHECK-NEXT: SU(4): Ord Latency=0
; CHECK: SU(3): STRWui %WZR, %vreg0, 0; mem:ST4[%ptr1] GPR64common:%vreg0
; CHECK: Successors:
-; CHECK: ord SU(4): Latency=0
+; CHECK: SU(4): Ord Latency=0
; CHECK: SU(4): STRWui %WZR, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
; CHECK: SU(5): %W0<def> = COPY %vreg2; GPR32:%vreg2
; CHECK: ** ScheduleDAGMI::schedule picking next node
diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll
index 2ed744277385a..0af0e58a91d45 100644
--- a/test/CodeGen/AArch64/fence-singlethread.ll
+++ b/test/CodeGen/AArch64/fence-singlethread.ll
@@ -16,6 +16,6 @@ define void @fence_singlethread() {
; IOS: ; COMPILER BARRIER
; IOS-NOT: dmb
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
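The point of this test is that a singlethread fence is a compiler-only barrier: it restricts reordering within one thread (signal-handler ordering) and must emit no dmb, whereas a default-scope fence still does. Sketch for contrast, with the assumed AArch64 lowering in the comments:

define void @fence_scopes() {
  fence syncscope("singlethread") seq_cst   ; no instruction emitted; compiler barrier only
  fence seq_cst                             ; lowers to a dmb ish on AArch64
  ret void
}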
diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll
new file mode 100644
index 0000000000000..88e6f5dd01c91
--- /dev/null
+++ b/test/CodeGen/AArch64/preferred-function-alignment.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt81 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt83 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt88 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s
+
+define void @test() {
+ ret void
+}
+
+; CHECK-LABEL: test
+; ALIGN2: .p2align 2
+; ALIGN3: .p2align 3
+; ALIGN4: .p2align 4
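.p2align takes a power-of-two exponent, so the ALIGN2/ALIGN3/ALIGN4 prefixes pin down 4-, 8-, and 16-byte function alignment for the respective CPUs. The same placement can be requested per function in IR, as in this sketch:

define void @force_align() align 16 {   ; what ".p2align 4" asks for: 2^4 = 16 bytes
  ret void
}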
diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 4fbd8944f0322..7e76dac214a14 100644
--- a/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -37,8 +37,8 @@ declare void @callee2(i8*, i8*, i8*, i8*, i8*,
; CHECK: SU({{.*}}): [[VRB]]<def> = LDRXui <fi#-2>
; CHECK-NOT: SU
; CHECK: Successors:
-; CHECK: ord SU([[DEPSTOREB:.*]]): Latency=0
-; CHECK: ord SU([[DEPSTOREA:.*]]): Latency=0
+; CHECK: SU([[DEPSTOREB:.*]]): Ord Latency=0
+; CHECK: SU([[DEPSTOREA:.*]]): Ord Latency=0
; CHECK: SU([[DEPSTOREA]]): STRXui %vreg{{.*}}, <fi#-4>
; CHECK: SU([[DEPSTOREB]]): STRXui %vreg{{.*}}, <fi#-3>
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index bee13d8c17f1d..98848295a73b2 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}v_test_add_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -67,7 +67,7 @@ define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -86,7 +86,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 7e4546d2cfb3f..6dcd7c234dc6d 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -5,9 +5,9 @@
;FUNC-LABEL: {{^}}test1:
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
-;SI-NOT: [[REG]]
-;SI: buffer_store_dword [[REG]],
+;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
+;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
+;SI: buffer_store_dword v[[REG]],
define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
@@ -21,8 +21,8 @@ define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -39,10 +39,10 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
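The flipped SI checks in this file (v_add_i32_e32 becoming s_add_i32) follow from global loads of kernel-uniform addresses now being scalarized, which leaves the whole add on the scalar unit. Tests below that still want VALU coverage either pass -amdgpu-scalarize-global-loads=false (see alignbit-pat.ll) or make the address divergent, roughly as in this sketch:

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @divergent_add(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid   ; per-lane address
  %a = load i32, i32 addrspace(1)* %gep                       ; stays a vector (VGPR) load
  %add = add i32 %a, 1                                        ; selects to v_add_i32
  store i32 %add, i32 addrspace(1)* %out
  ret void
}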
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index 76f724c2b90ba..4baa35ca57c58 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -168,10 +168,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; VI-NOT: shl
-; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll
index 00a125c2e44fb..d33965d4dda7a 100644
--- a/test/CodeGen/AMDGPU/add_i128.ll
+++ b/test/CodeGen/AMDGPU/add_i128.ll
@@ -19,10 +19,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128
; Check that the add with the SGPR operand now stays on the scalar unit (the uniform load is scalarized).
; GCN-LABEL: {{^}}sgpr_operand:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %foo, %a
@@ -31,10 +31,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad
}
; GCN-LABEL: {{^}}sgpr_operand_reversed:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %a, %foo
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 62733d5bfb6c9..f673d91192b84 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add
; Check that the add with the SGPR operand now stays on the scalar unit (the uniform load is scalarized).
; SI-LABEL: {{^}}sgpr_operand:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %foo, %a
@@ -32,8 +32,8 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr
; SGPR as other operand.
;
; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %a, %foo
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index b1e71722d80c5..a6aa9e7951515 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,20 +10,22 @@
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-
-; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
-; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
-; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
@@ -48,6 +50,12 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
@@ -55,12 +63,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
-; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
-; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
-; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
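What the regrouped CI and GFX9 checks match: a group-to-flat cast tests the 32-bit source against the invalid group pointer (-1), a private-to-flat cast against 0, and selects a null flat pointer in that case; otherwise the source is paired with the aperture base as the high word. The IR being lowered is just an addrspacecast, sketched here with the address-space numbering this era of the backend used (3 = LDS, 4 = flat):

define amdgpu_kernel void @group_to_flat_sketch(i32 addrspace(3)* %p) {
  %flat = addrspacecast i32 addrspace(3)* %p to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %flat   ; the flat_store_dword the tests look for
  ret void
}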
diff --git a/test/CodeGen/AMDGPU/alignbit-pat.ll b/test/CodeGen/AMDGPU/alignbit-pat.ll
index ff5c8960fad36..3f07188063cde 100644
--- a/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ b/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}alignbit_shr_pat:
; GCN-DAG: s_load_dword s[[SHR:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 0e5605961e10c..0c7160df2b96d 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
-; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
@@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
@@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
; FIXME: Should be able to get fdiv for 1.0 component
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
; CHECK: store volatile <2 x float> %arcp.25ulp
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
@@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
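The rewritten checks capture a policy change in AMDGPUCodeGenPrepare: a 2.5-ulp !fpmath hint combined with fast or arcp is now left as a plain fdiv for the backend rather than expanded early to @llvm.amdgcn.fdiv.fast, and the vector cases are no longer scalarized element by element. The surviving form, as a sketch:

define float @fast_25ulp_sketch(float %a, float %b) {
  %q = fdiv fast float %a, %b, !fpmath !0   ; kept intact for the DAG to lower
  ret float %q
}

!0 = !{float 2.500000e+00}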
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
index 2aec03aff8a3a..ef11ae87267eb 100644
--- a/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -2,8 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_and_i64_br:
-; SI: v_and_b32
-; SI: v_and_b32
+; SI: s_and_b64
define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index c356f8b87cfc6..ee0190149e92e 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #0
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,11 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
@@ -136,7 +137,9 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrs
; FUNC-LABEL: {{^}}v_and_constant_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -145,7 +148,9 @@ define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrsp
; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, 64
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -154,7 +159,9 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 a
; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, -16
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -239,8 +246,11 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
; SI: v_and_b32
; SI: v_and_b32
define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
+ %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
+ %b = load i64, i64 addrspace(1)* %gep.b, align 8
%and = and i64 %a, %b
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -251,7 +261,9 @@ define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 1231231234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -299,26 +311,30 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -326,13 +342,15 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr
; FIXME: Should be able to reduce load width
; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, -8
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -549,5 +567,4 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-
attributes #0 = { nounwind readnone }
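All of the hunks above apply one idiom: indexing the input pointer by the workitem id makes the load divergent, so instruction selection keeps it as a VGPR ({{buffer|flat}}_load) access instead of folding the uniform load into a scalar s_load. A minimal sketch of that idiom follows; the kernel name is illustrative and the file below is not part of this commit.

declare i32 @llvm.r600.read.tidig.x() #0

define amdgpu_kernel void @divergent_load_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  ; The per-lane index makes %gep differ across the wavefront, so the
  ; load cannot be treated as uniform and stays in the VALU.
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep, align 4
  store i32 %a, i32 addrspace(1)* %out, align 4
  ret void
}

attributes #0 = { nounwind readnone }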
diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
index c61c23222bc7e..cdc60ab504e01 100644
--- a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
-; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dword
+; GCN: s_load_dwordx4
+; GCN-DAG: s_load_dwordx4
+; GCN-DAG: s_load_dword
; GCN: {{buffer|flat}}_store_byte
; GCN: {{buffer|flat}}_store_byte
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 539373f7bdeb4..f29bfb46b94bd 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -2,6 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1
@@ -42,12 +44,14 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val)
}
; FUNC-LABEL: {{^}}v_brev_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
ret void
@@ -66,7 +70,9 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI: v_bfrev_b32_e32
; SI: v_bfrev_b32_e32
define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
ret void
@@ -82,7 +88,9 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val)
; FUNC-LABEL: {{^}}v_brev_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
- %val = load i64, i64 addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
ret void
@@ -97,7 +105,9 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FUNC-LABEL: {{^}}v_brev_v2i64:
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
- %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
index d2dacd7c17b3f..eb3fc2fab34fd 100644
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
; FUNC-LABEL: @test_bswap_i32
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: s_load_dword [[VAL:s[0-9]+]]
; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 5dec3e35ab3d0..c114332a58872 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -1,9 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
@@ -40,7 +40,7 @@ done:
; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
; OPT: getelementptr i32, i32 addrspace(4)* %out,
-; OPT-CI-NOT: getelementptr
+; OPT-CI-NOT: getelementptr
; OPT: br i1
; OPT-CI: addrspacecast
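The RUN-line change is the other half of this update: passing -amdgpu-scalarize-global-loads=false opts these codegen tests out of the new uniform-load scalarization so their existing buffer/flat check lines keep matching. A hedged illustration of what the flag controls, as a hypothetical test file rather than one from this commit:

; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=VGPR %s
; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SGPR %s

; The pointer is a uniform kernel argument, so by default the load is now
; selected as a scalar s_load; with the flag it stays a buffer load.
; VGPR: buffer_load_dword
; SGPR: s_load_dword
define amdgpu_kernel void @uniform_load_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %v = load i32, i32 addrspace(1)* %in, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}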
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index c1cf56e5058ec..c01d834bc33d6 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,9 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
index 6ecf75c1acec3..90fba03420901 100644
--- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
+++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -1,36 +1,4 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s
---- |
- define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_max_reg_imm_f32() #0 {
- ret void
- }
-
- attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
-
-...
---
# GCN-LABEL: name: v_max_self_clamp_not_set_f32
# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
@@ -70,7 +38,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -132,7 +100,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -195,7 +163,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -260,7 +228,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -337,7 +305,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -402,7 +370,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -435,7 +403,7 @@ registers:
- { id: 0, class: vgpr_32 }
- { id: 1, class: vgpr_32 }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %vgpr0
%0 = COPY %vgpr0
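These .mir hunks delete the LLVM IR section that the MIR no longer needs; once it is gone, block labels such as bb.0 (%ir-block.0) (and, in constant-fold-imm-immreg.mir below, memory operands like :: (volatile store 4 into %ir.gep.out)) have no IR to refer to, so they are reduced to plain MIR. A minimal standalone form, assuming a hypothetical function name:

# RUN: llc -march=amdgcn -run-pass none -o - %s | FileCheck %s
# CHECK: bb.0:
---
name: standalone_sketch
body: |
  bb.0:
    S_ENDPGM
...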
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll
index 3e1b76a1df094..14b798ba822b7 100644
--- a/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 8
+; CHECK: ; NumVgprs: 4
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index ed78ccc9b617c..0401f7b07e218 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -1,84 +1,5 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
---- |
- define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %and = and i32 %a, 1234567
- store volatile i32 %and, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %and = and i32 %a, 1234567
- store i32 %and, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %shl = shl i32 %a, 12
- store volatile i32 %shl, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %shl = shl i32 %a, 12
- store i32 %shl, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %ashr = ashr i32 %a, 12
- store volatile i32 %ashr, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %ashr = ashr i32 %a, 12
- store i32 %ashr, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %lshr = lshr i32 %a, 12
- store volatile i32 %lshr, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %lshr = lshr i32 %a, 12
- store i32 %lshr, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @undefined_vreg_operand() {
- unreachable
- }
-
- declare i32 @llvm.amdgcn.workitem.id.x() #1
-
- attributes #0 = { nounwind }
- attributes #1 = { nounwind readnone }
-
...
----
# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}}
# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec
@@ -119,11 +40,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %1 = S_LOAD_DWORDX2_IMM %0, 36, 0
%2 = COPY %1.sub1
%3 = COPY %1.sub0
%4 = S_MOV_B32 61440
@@ -133,7 +54,7 @@ body: |
%8 = S_MOV_B32 9999
%9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
%10 = COPY %9
- BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -204,12 +125,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%31 = V_ASHRREV_I32_e64 31, %3, implicit %exec
%32 = REG_SEQUENCE %3, 1, %31, 2
%33 = V_LSHLREV_B64 2, killed %32, implicit %exec
@@ -223,19 +144,19 @@ body: |
%34 = V_MOV_B32_e32 63, implicit %exec
%27 = V_AND_B32_e64 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_AND_B32_e64 %24, %26, implicit %exec
- FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
%29 = V_AND_B32_e32 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr
%30 = V_AND_B32_e64 %26, %26, implicit %exec
- FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr
%31 = V_AND_B32_e64 %34, %34, implicit %exec
- FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -285,11 +206,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 1
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -298,7 +219,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -390,7 +311,7 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
@@ -411,34 +332,34 @@ body: |
%27 = S_MOV_B32 -4
%11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_LSHL_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_LSHL_B32_e64 12, %7, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_LSHL_B32_e64 12, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_LSHL_B32_e64 %6, 12, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_LSHL_B32_e64 %6, 32, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_LSHL_B32_e32 %6, %6, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_LSHL_B32_e32 %27, %6, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -485,11 +406,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -498,7 +419,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -593,12 +514,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
@@ -619,34 +540,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_ASHR_I32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_ASHR_I32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_ASHR_I32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_ASHR_I32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_ASHR_I32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_ASHR_I32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_ASHR_I32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -693,11 +614,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 -999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -706,7 +627,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -802,12 +723,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
@@ -828,34 +749,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_LSHR_B32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_LSHR_B32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_LSHR_B32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_LSHR_B32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_LSHR_B32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_LSHR_B32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_LSHR_B32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 8611cd080e15d..09d4b2c8bd774 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index d772d1b679369..e39bd60a1cc88 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; FUNC-LABEL: {{^}}test_copy_v4i8:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -41,14 +47,16 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -57,7 +65,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
}
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: v_lshrrev_b32
; GCN: v_and_b32
; GCN: v_or_b32
@@ -66,7 +74,9 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -97,19 +107,21 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
- %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
-; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
@@ -120,9 +132,9 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
@@ -135,10 +147,10 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
@@ -148,10 +160,10 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 149c50685b1db..a544cbe890b50 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
@@ -34,9 +34,9 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
}
; FUNC-LABEL: {{^}}v_ctlz_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
+; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
+; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -44,14 +44,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -62,14 +64,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -90,16 +94,25 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; EG-DAG: FFBH_UINT
; EG-DAG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
+; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
+
+; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
+; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
@@ -136,12 +149,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; FUNC-LABEL: {{^}}v_ctlz_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
@@ -168,12 +181,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -182,12 +197,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -197,13 +214,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
; TODO: Should be able to eliminate select here as well.
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %ctlz, 32
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -212,13 +231,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %ctlz, 32
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -242,7 +263,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; SI: buffer_store_short [[FFBH]],
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
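The sel_eq_neg1 and sel_ne_neg1 tests above exercise a select fold: v_ffbh_u32 already returns -1 for a zero input, so a compare against zero that selects -1 is redundant and the whole pattern collapses to the bare ffbh. A sketch of the IR shape involved, with an illustrative kernel name:

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone

define amdgpu_kernel void @ctlz_sel_fold_sketch(i32 addrspace(1)* %out, i32 %x) {
  ; ctlz with the zero-is-undef flag false must yield 32 for %x == 0, but the
  ; select overrides exactly that case with -1, which is what the hardware
  ; instruction produces anyway, so no v_cmp/v_cndmask is needed.
  %ctlz = call i32 @llvm.ctlz.i32(i32 %x, i1 false) nounwind readnone
  %cmp = icmp eq i32 %x, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out, align 4
  ret void
}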
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 48f3e4401f1a8..7500da536307f 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -29,21 +29,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -52,14 +54,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -72,18 +76,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
- %val = load i8, i8 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
+ %val = load i8, i8 addrspace(1)* %in.gep
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
ret void
@@ -116,11 +124,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
@@ -145,11 +153,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -158,11 +168,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -186,15 +198,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
; GCN-DAG: buffer_store_dword [[RESULT0]]
; GCN-DAG: buffer_store_byte [[RESULT1]]
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -205,13 +219,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -221,13 +237,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 0
@@ -237,13 +255,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
; Compare on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 1
%sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -253,13 +273,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 1
%sel = select i1 %cmp, i32 %ctlz, i32 0
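The same idiom recurs throughout these test updates: indexing the input pointer by the workitem id makes the load address divergent, so the load must stay a per-lane buffer/flat access instead of being turned into a scalar load. A minimal sketch of the pattern, reusing the names the hunks above already use:

    %tid = call i32 @llvm.r600.read.tidig.x()
    %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
    %val = load i32, i32 addrspace(1)* %in.gep

This is also why the check lines widen from buffer_load to {{buffer|flat}}_load: with a divergent address, SI selects buffer addr64 forms while VI selects flat instructions.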
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index aa913ad406d2b..68b39bad2bc12 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -8,6 +8,8 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctpop_i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]],
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
@@ -24,22 +26,24 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -49,8 +53,11 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %val1 = load i32, i32 addrspace(1)* %in1, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
+ %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1
@@ -59,15 +66,17 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
- %add = add i32 %ctpop0, %sval
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, %sval
store i32 %add, i32 addrspace(1)* %out, align 4
ret void
}
@@ -80,7 +89,9 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
ret void
@@ -98,7 +109,9 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
ret void
@@ -124,7 +137,9 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
ret void
@@ -166,21 +181,25 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
%ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
ret void
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 4
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -188,14 +207,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 4, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -203,14 +224,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 99999
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -218,7 +241,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -226,7 +249,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, %const
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -234,7 +259,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -242,7 +267,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %const, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -250,18 +277,22 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
- %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
%const = load i32, i32 addrspace(1)* %gep, align 4
%add = add i32 %const, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index f18bd9fd8174b..4850370851f63 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
@@ -25,14 +27,16 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val)
}
; FUNC-LABEL: {{^}}v_ctpop_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
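The two bcnt check lines above encode how a 64-bit popcount is lowered on GCN: v_bcnt_u32_b32 adds the popcount of its first operand to its second, so the i64 count is accumulated in two steps. Roughly, in IR terms (a sketch, not text from this test):

    %lo = trunc i64 %val to i32
    %shr = lshr i64 %val, 32
    %hi = trunc i64 %shr to i32
    %cnt.lo = call i32 @llvm.ctpop.i32(i32 %lo)   ; first bcnt, accumulator 0
    %cnt.hi = call i32 @llvm.ctpop.i32(i32 %hi)
    %sum = add i32 %cnt.hi, %cnt.lo               ; folded into the second bcnt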
@@ -40,7 +44,7 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
}
; FUNC-LABEL: {{^}}v_ctpop_i64_user:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
@@ -49,7 +53,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%or = or i64 %ctpop, %s.val
store i64 %or, i64 addrspace(1)* %out
@@ -87,7 +93,9 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -105,7 +113,9 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -169,7 +179,8 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; FIXME: Should not have extra add
; FUNC-LABEL: {{^}}v_ctpop_i128:
-; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
@@ -182,7 +193,9 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
- %val = load i128, i128 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
+ %val = load i128, i128 addrspace(1)* %in.gep, align 8
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 1fa6407647eb8..1bfd38d94bfdf 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -5,6 +5,7 @@
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
@@ -21,21 +22,23 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
-; SI: buffer_load_dwordx2
+; SI: {{buffer|flat}}_load_dwordx2
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx2
@@ -44,14 +47,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
-; SI: buffer_load_dwordx4
+; SI: {{buffer|flat}}_load_dwordx4
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
@@ -64,7 +69,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
ret void
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0328ce31002df..f839129fc3d87 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-LABEL: {{^}}load_i8_to_f32:
-; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
%cvt = uitofp i8 %load to float
store float %cvt, float addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
-; GCN: buffer_load_ushort [[LD:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
%cvt = uitofp <2 x i8> %load to <2 x float>
store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: v_cvt_f32_ubyte3_e32
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -53,7 +59,9 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -64,10 +72,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
; FIXME: Packing bytes
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32
; GCN-DAG: v_or_b32
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
@@ -77,7 +85,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -124,14 +134,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
@@ -147,19 +159,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
%add = add i32 %load, 2
%inreg = and i32 %add, 255
%cvt = uitofp i32 %inreg to float
@@ -169,7 +185,9 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
%inreg = and i32 %load, 65280
%shr = lshr i32 %inreg, 8
%cvt = uitofp i32 %shr to float
@@ -181,7 +199,9 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
%ext = zext i8 %load to i32
%cvt = uitofp i32 %ext to float
store float %cvt, float addrspace(1)* %out, align 4
@@ -190,7 +210,9 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
%ext = zext <4 x i8> %load to <4 x i32>
%cvt = uitofp <4 x i32> %ext to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -198,12 +220,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
}
; GCN-LABEL: {{^}}extract_byte0_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%and = and i32 %val, 255
%cvt = uitofp i32 %and to float
store float %cvt, float addrspace(1)* %out
@@ -211,12 +235,14 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte1_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 8
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
@@ -225,12 +251,14 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte2_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 16
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
@@ -239,12 +267,14 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte3_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 24
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 3148b9b8ff9db..c265b8e2ad2ea 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,4 @@
# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
---- |
- define amdgpu_kernel void @test0() { ret void }
- define amdgpu_kernel void @test1() { ret void }
- define amdgpu_kernel void @test2() { ret void }
- define amdgpu_kernel void @test3() { ret void }
- define amdgpu_kernel void @test4() { ret void }
- define amdgpu_kernel void @test5() { ret void }
- define amdgpu_kernel void @loop0() { ret void }
- define amdgpu_kernel void @loop1() { ret void }
- define amdgpu_kernel void @loop2() { ret void }
...
---
# Combined use/def transfer check, the basics.
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 2c474dbe7b086..deb90df99dcf4 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -9,7 +9,7 @@
; SI-LABEL: @simple_read2_f32
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
@@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; SI-LABEL: @simple_read2_f32_max_offset
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 3dfdaf3936a64..ef4efc6336ce1 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -38,9 +38,9 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)*
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
-; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]]
+; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]]
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -64,8 +64,8 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)
; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]]
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 81b35a46aa188..b1fba8c240d7c 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -7,7 +7,7 @@
; SI-LABEL: @simple_read2st64_f32_0_1
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
@@ -26,7 +26,7 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0
; SI-LABEL: @simple_read2st64_f32_1_2
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -46,7 +46,7 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl
; SI-LABEL: @simple_read2st64_f32_max_offset
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index ace01593808b7..74404989f8c71 100644
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: Most of these cases don't trigger because of broken cost
; heuristics. Should not need -stress-early-ifcvt
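The -amdgpu-scalarize-global-loads=false added to these RUN lines opts the test out of what the rest of this diff works around by hand: loads from addresses that are uniform across the wave becoming scalar (s_load) accesses. Disabling the optimization keeps the per-lane loads the checks expect without rewriting every kernel to the workitem-id idiom. A hypothetical minimal case of the kind the flag guards:

    %val = load i32, i32 addrspace(1)* %in   ; %in is uniform, so this may become s_load_dword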
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll
index 9439130deb9ef..792f0b1eaef46 100644
--- a/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: This leaves behind a now-unnecessary 'and' with exec
diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
index 6eb1fc1d0cc29..b7dfcd99029a0 100644
--- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -2,16 +2,21 @@
; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
; Test that the -enable-no-signed-zeros-fp-math flag works
; GCN-LABEL: {{^}}fneg_fsub_f32:
-; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
; GCN-UNSAFE-NOT: xor
define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
- %a = load float, float addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %tid, 1
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
+ %a = load float, float addrspace(1)* %gep, align 4
%b = load float, float addrspace(1)* %b_ptr, align 4
%result = fsub float %a, %b
%neg.result = fsub float -0.0, %result
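With no-signed-zeros (or unsafe-fp-math) in effect, the negated subtract above is free to commute: fsub -0.0, (fsub %a, %b) folds to fsub %b, %a, changing only the sign of a zero result. That is why the explicit sign-bit xor survives only in the safe run. The folded form, as a sketch:

    %neg.result = fsub float %b, %a   ; legal only when signed zeros are ignored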
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index 34999fa3aea43..3fb452de1ccf4 100644
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -1,5 +1,7 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
; Make sure the add and load are reduced to 32 bits even with the
; bitcast to vector.
; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0:
@@ -8,7 +10,9 @@
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
; GCN: buffer_store_dword [[ADD]]
define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
- %a = load i64, i64 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x i32>
%extract = extractelement <2 x i32> %val.bc, i32 0
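Only the low 32 bits of the 64-bit add feed the extract, so the combine under test shrinks both the load and the add to 32 bits, matching the v_add_i32 on dword operands checked above. The reduced form is morally this sketch:

    %a.lo = trunc i64 %a to i32   ; in practice the load itself is narrowed to one dword
    %b.lo = trunc i64 %b to i32
    %extract = add i32 %a.lo, %b.lo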
@@ -21,7 +25,9 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou
; GCN: v_add_f64
; GCN: buffer_store_dword v
define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
- %a = load double, double addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %a = load double, double addrspace(1)* %gep
%add = fadd double %a, %b
%val.bc = bitcast double %add to <2 x i32>
%extract = extractelement <2 x i32> %val.bc, i32 0
@@ -34,7 +40,9 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out
; GCN: v_add_i32
; GCN: buffer_store_dword
define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
- %a = load i64, i64 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x float>
%extract = extractelement <2 x float> %val.bc, i32 0
diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll
index 4e2ec4b3054fe..d56d5ec1411a9 100644
--- a/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
; VI: flat_load_ushort [[HI:v[0-9]+]]
; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
+; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
; VI: flat_store_dword
; GFX9: s_load_dword [[VAL:s[0-9]+]]
@@ -62,8 +62,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -80,7 +80,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
-; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
+; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]]
; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
@@ -134,7 +134,9 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%fmul = fmul <2 x half> %fabs, %val
store <2 x half> %fmul, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index 9edf55cbc69fe..0c4a77964d154 100644
--- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -16,8 +16,8 @@
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]]
+; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
; GCN: buffer_load_dword [[V:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
@@ -75,13 +75,13 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
@@ -108,13 +108,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
@@ -191,17 +191,17 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]]
-; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]]
+; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]
; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
@@ -226,21 +226,21 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
-; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
+; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_subrev_f32_e32
+; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index 08199be144f49..88b3be0e0d31c 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -2,13 +2,13 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fadd_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16(
@@ -24,7 +24,7 @@ entry:
}
; GCN-LABEL: {{^}}fadd_f16_imm_a
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -42,7 +42,7 @@ entry:
}
; GCN-LABEL: {{^}}fadd_f16_imm_b
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -60,8 +60,8 @@ entry:
}
; GCN-LABEL: {{^}}fadd_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -70,16 +70,16 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -88,15 +88,18 @@ define amdgpu_kernel void @fadd_v2f16(
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+ %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+ %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
+ %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
%r.val = fadd <2 x half> %a.val, %b.val
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -105,12 +108,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -118,14 +121,16 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+ %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
%r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
@@ -134,12 +139,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -147,8 +152,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+ %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
%r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index c936d98673ba1..8fd1f52006fbb 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -5,8 +5,11 @@
; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
- %r0 = load double, double addrspace(1)* %in1
- %r1 = load double, double addrspace(1)* %in2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid
+ %r0 = load double, double addrspace(1)* %gep1
+ %r1 = load double, double addrspace(1)* %gep2
%r2 = fadd double %r0, %r1
store double %r2, double addrspace(1)* %out
ret void
@@ -42,3 +45,8 @@ define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x do
store <2 x double> %r2, <2 x double> addrspace(1)* %out
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
new file mode 100644
index 0000000000000..5383bbe71ae36
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -0,0 +1,487 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s
+
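+; Check that a call to llvm.canonicalize.* is folded away when its input is
+; already known to be canonical, and is otherwise kept as a multiply by 1.0.
+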
+; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %v = load float, float addrspace(1)* %gep, align 4
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fmul float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
+; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float 15.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
+; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fadd float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
+; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.sqrt.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
+; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.ceil.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
+; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.floor.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
+; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; GCN: flat_load_dword [[LOAD:v[0-9]+]],
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.canonicalize.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fpext float %load to double
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+ store double %canonicalized, double addrspace(1)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = fpext half %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v = fptrunc double %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fptrunc float %load to half
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+ store half %canonicalized, half addrspace(1)* %gep2, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
+; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
+; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
+; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
+; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
+ %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+ %v = fptrunc <2 x float> %load to <2 x half>
+ %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
+ %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float -0.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
+; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = fsub float -0.0, %v0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
+define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.fabs.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
+; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.fabs.f32(float %v0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
+; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.sin.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
+; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.cos.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
+; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.sin.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
+; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.cos.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
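+; The canonicalize of the minnum result is not folded here: the multiply by
+; 1.0 remains after the v_min with the signaling NaN operand.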
+; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
+; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v0 = fadd double %load, 0.0
+ %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ store double %canonicalized, double addrspace(1)* %gep, align 8
+ ret void
+}
+
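+; Shader calling convention: the canonicalize folds only when the source is
+; known not to produce a NaN (nnan); with fp-exceptions it must stay.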
+; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NEXT: ; return
+; GCN-NOT: 1.0
+define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul nnan float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.sqrt.f32(float) #0
+declare float @llvm.ceil.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sin.f32(float) #0
+declare float @llvm.cos.f32(float) #0
+declare half @llvm.sin.f16(half) #0
+declare half @llvm.cos.f16(half) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 404358f0ecb98..dd8e277c1c75f 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -5,6 +5,8 @@ declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
@@ -213,7 +215,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
ret void
@@ -233,7 +237,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -251,7 +257,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
@@ -270,7 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
index 8c385f40b1c5f..feb4c7bd4a183 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7916226462f77..aef898b1a8ee8 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fcmp_f16_lt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -351,23 +351,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lt:
+; SI: v_cmp_lt_f32_e32 vcc,
+; SI: v_cmp_lt_f32_e32 vcc,
+
+; VI: v_cmp_lt_f16_e32 vcc,
+; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -382,22 +371,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_eq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_eq_f32_e32 vcc,
+; SI: v_cmp_eq_f32_e32 vcc,
+
+; VI: v_cmp_eq_f16_e32 vcc,
+; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -411,23 +389,11 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_le
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_le:
+; SI: v_cmp_le_f32_e32 vcc
+; SI: v_cmp_le_f32_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -441,23 +407,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_gt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_gt:
+; SI: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e32 vcc,
+
+; VI: v_cmp_gt_f16_e32 vcc,
+; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -471,23 +426,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lg:
+; SI: v_cmp_lg_f32_e32 vcc,
+; SI: v_cmp_lg_f32_e32 vcc,
+
+; VI: v_cmp_lg_f16_e32 vcc,
+; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -501,23 +445,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_ge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_ge:
+; SI: v_cmp_ge_f32_e32 vcc,
+; SI: v_cmp_ge_f32_e32 vcc,
+
+; VI: v_cmp_ge_f16_e32 vcc,
+; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -531,23 +464,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_o
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_o:
+; SI: v_cmp_o_f32_e32 vcc,
+; SI: v_cmp_o_f32_e32 vcc,
+
+; VI: v_cmp_o_f16_e32 vcc,
+; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -561,23 +483,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_u
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_u:
+; SI: v_cmp_u_f32_e32 vcc,
+; SI: v_cmp_u_f32_e32 vcc,
+
+; VI: v_cmp_u_f16_e32 vcc,
+; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -592,22 +503,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nge_f32_e32 vcc,
+; SI: v_cmp_nge_f32_e32 vcc,
+
+; VI: v_cmp_nge_f16_e32 vcc,
+; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -622,22 +522,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nlg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nlg_f32_e32 vcc
+; SI: v_cmp_nlg_f32_e32 vcc
+
+; VI: v_cmp_nlg_f16_e32 vcc
+; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -652,22 +541,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_ngt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_ngt_f32_e32 vcc,
+; SI: v_cmp_ngt_f32_e32 vcc,
+
+; VI: v_cmp_ngt_f16_e32 vcc,
+; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -682,22 +560,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nle
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -712,22 +579,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_neq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -744,17 +600,19 @@ entry:
; GCN-LABEL: {{^}}fcmp_v2f16_nlt
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
+
+; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
+; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
+
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index b9e1921d4c455..95f7e0be7d9c9 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
; CHECK-LABEL: {{^}}flt_f64:
-; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -14,7 +14,7 @@ define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fle_f64:
-; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -26,7 +26,7 @@ define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fgt_f64:
-; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -38,7 +38,7 @@ define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fge_f64:
-; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fne_f64:
-; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -62,7 +62,7 @@ define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1
}
; CHECK-LABEL: {{^}}feq_f64:
-; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
index 1255977962454..ca313d80894a6 100644
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -6,8 +6,15 @@
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %r1 = load double, double addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
+ %r1 = load double, double addrspace(1)* %gep
%r2 = fadd double %r1, 5.000000e+00
store double %r2, double addrspace(1)* %out
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 4e2bf765cd95f..8e984246cc94d 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.copysign.f16(half, half)
declare float @llvm.copysign.f32(float, float)
@@ -9,16 +9,18 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
; GCN-LABEL: {{^}}test_copysign_f16:
-; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
-; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
-; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
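; (Annotation, not part of the test: the bit pattern behind these checks.
; s_brev_b32 of -2 bit-reverses 0xfffffffe into 0x7fffffff, the mask of all
; non-sign bits; v_bfi_b32 then computes (CONST & MAG) | (~CONST & SIGN),
; taking the magnitude from MAG and the sign bit from SIGN.)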
@@ -36,8 +38,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
@@ -48,17 +50,20 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
half addrspace(1)* %arg_mag,
float addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
%mag.ext = fpext half %mag to float
- %sign = load float, float addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+ %sign = load float, float addrspace(1)* %arg_sign_gep
%out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
store float %out, float addrspace(1)* %arg_out
ret void
}
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
@@ -70,17 +75,20 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
half addrspace(1)* %arg_mag,
double addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
%mag.ext = fpext half %mag to double
- %sign = load double, double addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+ %sign = load double, double addrspace(1)* %arg_sign_gep
%out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
store double %out, double addrspace(1)* %arg_out
ret void
}
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
@@ -93,8 +101,11 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
float addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load float, float addrspace(1)* %arg_mag
- %sign = load half, half addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+ %mag = load float, float addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%sign.ext = fpext half %sign to float
%out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
store float %out, float addrspace(1)* %arg_out
@@ -102,8 +113,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
@@ -116,8 +127,11 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
double addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load double, double addrspace(1)* %arg_mag
- %sign = load half, half addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
+ %mag = load double, double addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%sign.ext = fpext half %sign to double
%out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
store double %out, double addrspace(1)* %arg_out
@@ -125,8 +139,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
@@ -141,8 +155,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
half addrspace(1)* %arg_mag,
float addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
- %sign = load float, float addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+ %sign = load float, float addrspace(1)* %arg_sign_gep
%sign.trunc = fptrunc float %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
store half %out, half addrspace(1)* %arg_out
@@ -150,8 +167,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
@@ -166,8 +183,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
half addrspace(1)* %arg_mag,
double addrspace(1)* %arg_sign) {
entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
- %mag = load half, half addrspace(1)* %arg_mag
+ %mag = load half, half addrspace(1)* %arg_mag_gep
- %sign = load double, double addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+ %sign = load double, double addrspace(1)* %arg_sign_gep
%sign.trunc = fptrunc double %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
store half %out, half addrspace(1)* %arg_out
@@ -175,8 +195,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
@@ -193,9 +213,12 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
float addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load float, float addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+ %mag = load float, float addrspace(1)* %arg_mag_gep
%mag.trunc = fptrunc float %mag to half
- %sign = load half, half addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %out, half addrspace(1)* %arg_out
ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7f84e973c9582..333143393cb46 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -27,7 +27,7 @@
; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
-; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
+; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
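; (Annotation, not part of the test: v_mul_f32/v_mul_f16 are commutative, so
; the source-operand order in these checks does not affect the result; the
; order mainly reflects the VOP2 encoding, in which src0 may be an SGPR or
; inline constant while src1 must be a VGPR.)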
@@ -165,7 +165,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
@@ -187,7 +187,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 738a5adba14fb..bc489454341a0 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -20,7 +20,7 @@
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -45,7 +45,7 @@ entry:
; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -85,20 +85,11 @@ entry:
}
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
-; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
-; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
-
-; GCN-NOT: s_setreg
-; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
-; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
-; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
; GCN-NOT: s_setreg
-; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
-; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+; GCN: buffer_store_dword [[RESULT]]
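; (Annotation, not part of the test: under fast-math the division is expanded
; to the cheap reciprocal form, a/b -> a * v_rcp_f32(b), rather than the full
; div_scale/div_fmas/div_fixup sequence used for IEEE-accurate fdiv.)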
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv fast float %a, %b
@@ -121,6 +112,21 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+ %fdiv = fdiv fast float %a, %b, !fpmath !0
+ store float %fdiv, float addrspace(1)* %out
+ ret void
+}
+
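; (Annotation: !0 is assumed to be an !fpmath node, !{float 2.500000e+00},
; defined elsewhere in this file; it permits 2.5 ulp of error on the fdiv,
; which is what allows the rcp-based expansion checked above.)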
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
@@ -154,8 +160,9 @@ entry:
}
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; GCN: v_cmp_gt_f32
-; GCN: v_cmp_gt_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 4113ba8dc1f07..7526d08bdbe52 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
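; (Annotation, not part of the test: a minimal IR sketch of the conversion
; described above, using hypothetical value names. x * (y + 1.0) may be
; rewritten to fma(x, y, x), since x*(y+1) == x*y + x for finite values:
;   %t = fadd float %y, 1.0
;   %r = fmul float %x, %t   ; -> call float @llvm.fma.f32(float %x, float %y, float %x)
; )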
@@ -387,7 +387,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace
; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
@@ -403,7 +403,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
@@ -419,7 +419,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
@@ -435,7 +435,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
@@ -451,7 +451,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
@@ -467,7 +467,7 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
@@ -483,7 +483,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
@@ -499,7 +499,7 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
@@ -515,7 +515,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
@@ -531,7 +531,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
@@ -547,7 +547,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
@@ -563,7 +563,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
@@ -583,8 +583,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
-; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
+; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index 4d3f3712621ef..907121f1cd46b 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index 659cecb59ebf7..6be4c450a51ed 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 7643c3ea533ce..44c80b63bf7c3 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -91,7 +91,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index 27d9261b1fab8..4cfc9fc80fb07 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -872,8 +872,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(fl
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
-; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
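; (Annotation, not part of the test: the min/max chain being matched, sketched
; in IR with assumed value names:
;   %max = call float @llvm.maxnum.f32(float %a, float %b)
;   %min = call float @llvm.minnum.f32(float %max, float %c)
; )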
define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 52336f95a9096..0494295fc15f0 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -83,7 +83,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -121,7 +121,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index cd86409e20384..5f120f63d7fe3 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fmul_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16(
@@ -70,16 +70,16 @@ entry:
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -108,7 +108,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_a(
@@ -134,7 +134,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
index f14233f267b2b..d37d432842f37 100644
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FUNC-LABEL: {{^}}fmul_f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 9b713419e7471..980d68ceded87 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
@@ -108,7 +108,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
@@ -227,8 +227,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
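; (Annotation, not part of the test: v_subrev computes src1 - src0 whereas
; v_sub computes src0 - src1, so v_subrev_f16 [[RESULT]], [[REGC]], [[TMP]]
; and v_sub_f16 [[RESULT]], [[TMP]], [[REGC]] both produce TMP - REGC.)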
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -257,8 +257,8 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -287,7 +287,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -319,7 +319,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -347,13 +347,13 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]]
+; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -385,7 +385,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -416,7 +416,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[R2]], [[R1]], 2.0, -[[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e422550266924..4b1e41ff91e17 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,12 +1,12 @@
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
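; (Annotation: -fp-contract=on only contracts where the source requests it,
; e.g. via the fmuladd intrinsic, while -fp-contract=fast also fuses separate
; fmul/fadd pairs; the RUN lines above cross that with the fp32 denormal mode
; and fast/slow v_fma_f32.)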
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -96,7 +96,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -125,10 +125,10 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -160,10 +160,10 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -192,7 +192,7 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -221,7 +221,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -252,7 +252,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -282,7 +282,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -310,11 +310,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -345,11 +345,11 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -379,10 +379,10 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
@@ -414,10 +414,10 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; SI: buffer_store_dword [[RESULT]]
@@ -446,17 +446,17 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
+; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -489,10 +489,10 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -525,10 +525,10 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -556,10 +556,10 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
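
Pattern note: the check updates above change no computed value; they track a canonicalization of commutable VOP2 operand order, so v_mul/v_add/v_mac checks now expect [[REGA]], [[REGB]] where they previously expected the commuted [[REGB]], [[REGA]]. A minimal sketch of the updated idiom — a hypothetical standalone test, not part of this commit; the volatile loads keep both operands in vector registers and in program order:

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}mul_operand_order:
; CHECK: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; CHECK: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; CHECK: v_mul_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @mul_operand_order(float addrspace(1)* %out, float addrspace(1)* %in) {
  %b.ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b.ptr
  %mul = fmul float %a, %b
  store float %mul, float addrspace(1)* %out
  ret void
}
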
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index 86e91e04b0fc3..8d91a56ee4211 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
; GCN-LABEL: {{^}}fmuladd_f64:
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
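
The other recurring RUN-line change in this commit prepends -amdgpu-scalarize-global-loads=false. That option gates the pass which promotes provably uniform global loads to scalar (s_load/SMRD) loads; these tests were written against per-lane buffer_load/flat_load output, so pinning the flag off preserves the old selection instead of rewriting every check. A hedged sketch of the idiom, not taken from this commit:

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; With scalarization disabled, even a uniform global load stays a vector load.
; GCN: buffer_load_dword
define amdgpu_kernel void @uniform_global_load(float addrspace(1)* %out, float addrspace(1)* %in) {
  %v = load float, float addrspace(1)* %in
  store float %v, float addrspace(1)* %out
  ret void
}
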
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index 624610096cbc5..b50a26c023ca3 100644
--- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 66bf9d0ffb00e..002bc47fb96ae 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -9,7 +9,7 @@
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
@@ -31,7 +31,7 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
@@ -54,7 +54,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
@@ -82,10 +82,10 @@ define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32
+; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -106,10 +106,10 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +133,7 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float
; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -157,11 +157,11 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -185,10 +185,10 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
@@ -235,7 +235,7 @@ define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[ADD]]
@@ -280,7 +280,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -300,7 +300,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -364,7 +364,7 @@ define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
@@ -974,7 +974,7 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
@@ -1000,7 +1000,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[ADD]]
@@ -1494,7 +1494,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr
; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1556,7 +1556,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspac
; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
@@ -1664,7 +1664,7 @@ define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addr
; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
-; GCN: v_subrev_f32_e32
+; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32
; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc1
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
-; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: buffer_store_dword [[MUL1]]
; GCN: buffer_store_dword [[MUL0]]
@@ -1851,7 +1851,7 @@ define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: buffer_store_dword [[MUL]]
@@ -1984,8 +1984,8 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
; GCN: buffer_store_dword [[MUL1]]
; GCN-NEXT: buffer_store_dword [[MUL2]]
@@ -2084,7 +2084,7 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
-; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
+; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: buffer_store_dword [[FMA0]]
; GCN: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
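
The v_subrev_f32 to v_sub_f32 substitutions throughout this file encode the same subtraction either way: v_subrev computes src1 - src0, so a check for v_subrev_f32_e32 dst, [[B]], [[A]] and one for v_sub_f32_e32 dst, [[A]], [[B]] describe identical results, and the backend now emits the plain sub form when the operands can be kept in order. For reference, an assembly-comment sketch with hypothetical registers:

; v_subrev_f32_e32 v2, v1, v0   ; v2 = v0 - v1
; v_sub_f32_e32    v2, v0, v1   ; v2 = v0 - v1, same value, operands in order
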
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index f4afaca2b7a7f..56aea641d16e6 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
; CI: v_cvt_f32_f16_e32
; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
-; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
+; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
; GFX89-NOT: _and
; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
@@ -20,7 +20,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x,
; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
; CI-DAG: v_cvt_f32_f16_e32
; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
-; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
+; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
; CI: v_cvt_f16_f32_e32
; GFX89-NOT: _and
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index 0a7346f410c94..3f20ca73e9228 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
; SI-NOT: and
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 2d94726cbe204..49d6742527468 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -46,7 +46,7 @@ define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
-; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]
; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index 986c6b296c962..3155b7a8664fb 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -1,26 +1,5 @@
# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
---- |
- define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 {
- ret void
- }
-
- attributes #0 = { nounwind }
-
...
----
# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
@@ -62,14 +41,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -133,14 +112,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -204,14 +183,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -275,14 +254,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir
index afde89d6b64bc..51bb357fcf6ee 100644
--- a/test/CodeGen/AMDGPU/fold-operands-order.mir
+++ b/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -1,10 +1,4 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s
-
---- |
- define amdgpu_kernel void @mov_in_use_list_2x() {
- unreachable
- }
-
...
---
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 2c6b1cb18f7e6..579a1454dd9ae 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 15cc73b9ee53e..ec19fd199b4ec 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -154,7 +154,7 @@ entry:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
-; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index f310618d8bdb6..f593030764a99 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptosi_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index 7641c08e33c36..cebe3304d542b 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptoui_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index bc72f4424c98f..64df625d4bb5a 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
@@ -38,10 +38,10 @@ entry:
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
@@ -68,7 +68,7 @@ entry:
; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 9a56cbe983cdd..1314dfe3c7cab 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
declare double @llvm.fabs.f64(double) #0
declare double @llvm.floor.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 207fe280c9a69..2217f67da7d3b 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.floor.f32(float) #0
; GCN-LABEL: {{^}}fract_f32:
; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+; GCN-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[INPUT]], [[FLR]]
; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index 9778069d0477b..3b8f58cc18a7b 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
@@ -29,7 +29,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
-; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[X]], [[INVY]]
; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
; GCN: buffer_store_dword [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 453d8fb37f2f4..186757e4c5d84 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index a0fd3411ca05c..6bd9a0db14f66 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index fa00c06546dbd..15a4ce2d88f7d 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16(
@@ -70,16 +70,16 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
@@ -109,12 +109,12 @@ entry:
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]
@@ -143,12 +143,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}
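
The GFX9 constants above can be verified by hand: fsub by a constant vector folds to v_pk_add_f16 with the negated constant, and the packed immediates decompose into half-precision lanes as follows (worked encodings matching the 0x4000, 0xbc00, 0x40003c00, and 0xbc00c000 values in the checks):

; half  1.0 = 0x3C00     half  2.0 = 0x4000
; half -1.0 = 0xBC00     half -2.0 = 0xC000
; 0x40003c00 packs < 1.0 (lo),  2.0 (hi)>
; 0xbc00c000 packs <-2.0 (lo), -1.0 (hi)>
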
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index e7a92d95d4859..48647a2cdb898 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_fsub_f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%a = load float, float addrspace(1)* %in, align 4
@@ -41,10 +41,10 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
@@ -67,7 +67,7 @@ define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x flo
}
; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -80,7 +80,7 @@ define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrs
}
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI-NOT: xor
define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -93,7 +93,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float a
}
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI-NOT: xor
define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -109,7 +109,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %ou
; make sure it is disabled and the fneg is not folded if it is not
; "true".
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index dc332414a1527..73f1a69eeb9d6 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare double @llvm.fabs.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 1f72ec65588ea..bb2a6ba8e3483 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
declare double @llvm.trunc.f64(double) nounwind readnone
declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 19e592f50beaf..4e50f995d27e7 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
diff --git a/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/test/CodeGen/AMDGPU/global-smrd-unknown.ll
new file mode 100644
index 0000000000000..8a576e6480a11
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-smrd-unknown.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
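+; The tiny -memdep-block-scan-limit forces the memory-dependence query to
+; give up, so the global loads below cannot be proven unclobbered and are
+; presumably not scalarized to SMRD despite -amdgpu-scalarize-global-loads;
+; the checks expect them to stay as flat VMEM loads.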
+; GCN-LABEL: {{^}}unknown_memdep_analysis:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+; GCN: flat_store_dword
+define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg) #0 {
+bb:
+ %tmp53 = load float, float addrspace(1)* undef, align 4
+ %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31
+ %tmp55 = load float, float addrspace(1)* %tmp54, align 4
+ %tmp56 = tail call float @llvm.fmuladd.f32(float undef, float %tmp53, float %tmp55)
+ store float %tmp56, float addrspace(1)* undef, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index 41ae5a4a0b00b..43745d4b3da3d 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; half args should be promoted to float for SI and lower.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
@@ -471,10 +471,10 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out,
; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
+; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index c2668a077b098..8cda01a10f765 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Use a 64-bit value with lo bits that can be represented as an inline constant
; GCN-LABEL: {{^}}i64_imm_inline_lo:
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index cd3502baee7be..fe86a58729681 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; FIXME: Merge into imm.ll
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
@@ -305,7 +305,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword
; VI-NOT: and
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 0d20c32a4770c..62200b988bea3 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index c0f5218efc16b..75826d530cb04 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -222,9 +222,9 @@ entry:
; FIXME: Should be scheduled to shrink vcc
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1
; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
+; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc
define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
entry:
%val0 = load volatile i1, i1 addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 5cd965d2fa9c3..eea26192ed322 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GatherAllAliases gives up on trying to analyze cases where the
; pointer may have been loaded from an aliased store, so make sure
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index f08d4b6c79156..06dc2cc8b90e1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.fabs.f16(half %a)
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 1fcdac537fba6..f71b9752e9a10 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 2cc63ae74bf10..1b3e09a81e5a0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
; FIXME: Enable for VI.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index fe211d356070c..7068f45590551 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 593c95856811e..871b8c4f99b99 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
; GCN: v_bfe_i32
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index 495e36b09f8fa..39370e41e8aa9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index e0cec2134e70c..8468aa3a7b3ef 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 92e3a1099da0a..68fd08f778c43 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
; GCN: v_bfe_u32
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 0604a49372a2b..071f2a6de4cd0 100644
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.ceil.f16(half %a)
declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index d836ea36ef632..8931de63e74ba 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.cos.f16(half %a)
declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
@@ -29,8 +29,8 @@ entry:
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -48,8 +48,8 @@ entry:
; GCN-NOT: and
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @cos_v2f16(
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 5757142b9e954..4e96a76197160 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.exp2.f16(half %a)
declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 6a18141d8035e..74d1e694ffbe2 100644
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.floor.f16(half %a)
declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 3f4fba7d8ead0..a379b18ffb8b6 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -128,7 +128,7 @@ define amdgpu_kernel void @fma_f16_imm_c(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
@@ -167,7 +167,7 @@ define amdgpu_kernel void @fma_v2f16(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
@@ -210,7 +210,7 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
@@ -253,7 +253,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 806723e5136ca..2d4fe08d8bde8 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
-; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -13,11 +13,11 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half>
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
+; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
@@ -110,19 +110,19 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-FLUSH-NOT: v_and_b32
-; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -131,7 +131,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
-; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
+; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index 773eb55283e44..277195c532086 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.log2.f16(half %a)
declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 8f4b314ffabb2..c72716439a761 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
@@ -9,9 +9,9 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16(
@@ -73,18 +73,18 @@ entry:
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -115,7 +115,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_a(
@@ -143,7 +143,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 1a86286f7136c..0e93acc27dc5b 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
@@ -9,9 +9,9 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16(
@@ -72,18 +72,18 @@ entry:
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -116,7 +116,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_a(
@@ -144,7 +144,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 30cb969a76e5a..92282083984bc 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -34,12 +34,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index ffe87977870ba..7e29147571f2f 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -12,7 +12,7 @@
; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
-; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GCN: buffer_store_dword [[RESULT]]
; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
@@ -70,7 +70,7 @@ define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x floa
; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
-; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GFX89: buffer_store_short [[RESULT]]
define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
%x.arg.trunc = trunc i32 %x.arg to i16
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index eb1f32c981f88..08b9d9d873b49 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
@@ -29,9 +29,9 @@ entry:
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
@@ -47,10 +47,10 @@ entry:
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 46ee6526aca2f..0e1358ecca226 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sqrt.f16(half %a)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index dc7182aa0d89a..37ee4e92c6379 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.trunc.f16(half %a)
declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index bd6fea587b42f..77557a584093f 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f32:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index 5b772e1fe5ee3..84214b7dbc106 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index e3415b9c47dec..cb2495d5fdcf7 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index 5df32c1e3120a..6360d39666c77 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i32:
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index de16b6c8997ef..c71db0b7357cd 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index fc0cbf916b529..3fe6bd26be14f 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index d6162c388b5b1..f9ba6241fe067 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i24:
; SI: {{flat|buffer}}_load_ubyte
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 74564f387edeb..e1a2af6c7ef90 100644
--- a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -21,6 +22,17 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)*
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
+
+; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
+; WOPT-NOT: call
+; WOPT: br label %load-store-loop
+; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
ret void
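The WOPT lines above describe the loop that -use-wide-memcpy-loop-lowering emits for this 1025-byte copy. Pieced together from those CHECK lines, the lowered IR has roughly the following shape; the function name and the phi node are assumptions, while the block labels, operand types, and the 1025 trip count are taken directly from the checks.

  ; Sketch of @min_size_large_static_memcpy_caller0 after lowering.
  define amdgpu_kernel void @memcpy_loop_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
  entry:
    br label %load-store-loop

  load-store-loop:                                ; one byte copied per iteration
    %loop-index = phi i64 [ 0, %entry ], [ %3, %load-store-loop ]
    %0 = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
    %1 = load i8, i8 addrspace(1)* %0
    %2 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
    store i8 %1, i8 addrspace(1)* %2
    %3 = add i64 %loop-index, 1
    %4 = icmp ult i64 %3, 1025                    ; copy length from the call site
    br i1 %4, label %load-store-loop, label %memcpy-split

  memcpy-split:
    ret void
  }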
diff --git a/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
new file mode 100644
index 0000000000000..768acf35eeae3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
@@ -0,0 +1,227 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: cluster_add_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_add_addc
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit-def %vcc
+ %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: interleave_add64s
+# GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+# GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+# GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+name: interleave_add64s
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: sreg_64 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: sreg_64 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: sreg_64 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+ %4 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_MOV_B32_e32 0, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+
+ %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+
+
+ %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+ %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_mov_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %2 = S_MOV_B64 0
+# GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
+name: cluster_mov_addc
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = S_MOV_B64 0
+ S_NOP 0, implicit-def %vcc
+ %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
+...
+
+# GCN-LABEL: name: no_cluster_add_addc_diff_sgpr
+# GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %8 = S_MOV_B64 0
+# GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+name: no_cluster_add_addc_diff_sgpr
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: sreg_64 }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %8 = S_MOV_B64 0
+ %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit-def %vcc
+ %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+...
+# GCN-LABEL: name: cluster_sub_subb
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_sub_subb
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit-def %vcc
+ %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_cmp_cndmask
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+name: cluster_cmp_cndmask
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ S_NOP 0, implicit-def %vcc
+ %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+
+ %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ S_NOP 0, implicit-def %vcc
+ %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+ %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask2
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask2
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+ %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
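Every positive case in this new test checks the same scheduler property: the macro-fusion DAG mutation keeps a carry-defining instruction and its carry-consuming user adjacent, minimizing the live range of the carry (the value that ultimately wants to live in %vcc), and it declines to cluster when the carry comes from an unrelated def (no_cluster_add_addc_diff_sgpr). Schematically, for cluster_add_addc (a sketch reusing its register numbers; unrelated instructions are elided from the scheduled order):

  # Input order: two moves and a %vcc clobber separate the pair.
  %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
  %6 = V_MOV_B32_e32 0, implicit %exec
  %7 = V_MOV_B32_e32 0, implicit %exec
  S_NOP 0, implicit-def %vcc
  %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec

  # Scheduled order: the carry def %3 feeds its user immediately.
  S_NOP 0, implicit-def %vcc
  %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
  %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec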
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index b855fc500c6b4..8a6bf853a7c6a 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -19,15 +19,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -55,15 +55,15 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
-; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -99,11 +99,11 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -133,8 +133,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -167,9 +167,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -205,8 +205,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -238,9 +238,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -278,7 +278,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -313,8 +313,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -355,9 +355,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -395,13 +395,13 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP1]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -437,13 +437,13 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -479,21 +479,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]]
+; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -530,21 +530,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]
; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 8e0014911def8..77c35fac8b5de 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
-; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -199,7 +199,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
bb:
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 6bc40e82459bb..b78d65ae1e1a1 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -32,8 +32,8 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
-; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
; GCN: s_endpgm
define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index ffcdac03bc74c..6387c9ff6dfaf 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index dfd5b97fcc865..6b0ec483247cf 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index b23b21118aaa3..97666492e376f 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 57c50c9804e56..a0290789175d3 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; mul24 and mad24 are affected
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 82c27f204a478..ba3ff0b08bc92 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -66,9 +66,9 @@
; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index 8a7bf6db5b8d4..500e4cb3cc73e 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index eb082843fb829..8e6885c4fc5e8 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}or_v2i32:
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index f83eb56dc6edf..776b151e30170 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index ecb513cd80b6e..d8c7438e4d0da 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
new file mode 100644
index 0000000000000..bac348aaed709
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
@@ -0,0 +1,162 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s
+#
+# See bug http://llvm.org/PR33524 for details of the problem being checked here.
+# This test provokes a subrange join (see the annotations below) during simple register coalescing.
+# Without a fix for PR33524, that join runs into an unreachable.
+#
+# GCN-DAG: undef %[[REG0:[0-9]+]].sub0 = COPY %sgpr5
+# GCN-DAG: undef %[[REG1:[0-9]+]].sub0 = COPY %sgpr2
+# GCN-DAG: %[[REG0]].sub1 = S_MOV_B32 1
+# GCN-DAG: %[[REG1]].sub1 = S_MOV_B32 1
+
+--- |
+ define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 {
+ ret void
+ }
+
+...
+---
+name: regcoal-subrange-join
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: vreg_128 }
+ - { id: 2, class: vreg_128 }
+ - { id: 3, class: vreg_128 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_32_xm0, preferred-register: '%8' }
+ - { id: 7, class: vreg_128 }
+ - { id: 8, class: sreg_32_xm0, preferred-register: '%6' }
+ - { id: 9, class: vreg_128 }
+ - { id: 10, class: sgpr_32 }
+ - { id: 11, class: sgpr_32 }
+ - { id: 12, class: sgpr_32 }
+ - { id: 13, class: sgpr_32 }
+ - { id: 14, class: sgpr_32 }
+ - { id: 15, class: sgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: sreg_32_xm0 }
+ - { id: 18, class: sreg_64 }
+ - { id: 19, class: sreg_32_xm0 }
+ - { id: 20, class: sreg_32_xm0 }
+ - { id: 21, class: sreg_64 }
+ - { id: 22, class: sreg_32_xm0_xexec }
+ - { id: 23, class: sreg_32_xm0 }
+ - { id: 24, class: sreg_64_xexec }
+ - { id: 25, class: sreg_128 }
+ - { id: 26, class: sreg_64_xexec }
+ - { id: 27, class: sreg_32_xm0_xexec }
+ - { id: 28, class: sreg_32_xm0 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vgpr_32 }
+ - { id: 33, class: vgpr_32 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vgpr_32 }
+ - { id: 38, class: sreg_128 }
+ - { id: 39, class: sreg_64_xexec }
+ - { id: 40, class: sreg_32_xm0_xexec }
+ - { id: 41, class: sreg_32_xm0 }
+ - { id: 42, class: vgpr_32 }
+ - { id: 43, class: vgpr_32 }
+ - { id: 44, class: vgpr_32 }
+ - { id: 45, class: vgpr_32 }
+ - { id: 46, class: vgpr_32 }
+ - { id: 47, class: vgpr_32 }
+ - { id: 48, class: vgpr_32 }
+ - { id: 49, class: vgpr_32 }
+ - { id: 50, class: vgpr_32 }
+ - { id: 51, class: sreg_128 }
+ - { id: 52, class: vgpr_32 }
+ - { id: 53, class: vgpr_32 }
+ - { id: 54, class: vgpr_32 }
+ - { id: 55, class: vgpr_32 }
+ - { id: 56, class: vreg_128 }
+ - { id: 57, class: vreg_128 }
+ - { id: 58, class: vreg_128 }
+ - { id: 59, class: sreg_32_xm0 }
+ - { id: 60, class: sreg_32_xm0 }
+ - { id: 61, class: vreg_128 }
+liveins:
+ - { reg: '%sgpr2', virtual-reg: '%12' }
+ - { reg: '%sgpr5', virtual-reg: '%15' }
+body: |
+ bb.0:
+ liveins: %sgpr2, %sgpr5
+
+ %15 = COPY killed %sgpr5
+ %12 = COPY killed %sgpr2
+ %17 = S_MOV_B32 1
+ undef %18.sub1 = COPY %17
+ %0 = COPY %18
+ %0.sub0 = COPY killed %12
+ %21 = COPY killed %18
+ %21.sub0 = COPY killed %15
+ %22 = S_LOAD_DWORD_IMM killed %21, 2, 0
+ %23 = S_MOV_B32 491436
+ undef %24.sub0 = COPY killed %22
+ %24.sub1 = COPY killed %23
+ %25 = S_LOAD_DWORDX4_IMM killed %24, 0, 0
+ %1 = COPY killed %25
+ %26 = S_LOAD_DWORDX2_IMM %0, 2, 0
+ dead %27 = S_LOAD_DWORD_IMM killed %26, 0, 0
+ S_CBRANCH_SCC0 %bb.1, implicit undef %scc
+
+ bb.5:
+ %58 = COPY killed %1
+ %59 = COPY killed %17
+ S_BRANCH %bb.2
+
+ bb.1:
+ %30 = V_MOV_B32_e32 1036831949, implicit %exec
+ %31 = V_ADD_F32_e32 %30, %1.sub3, implicit %exec
+ %33 = V_ADD_F32_e32 %30, %1.sub2, implicit %exec
+ %35 = V_ADD_F32_e32 %30, %1.sub1, implicit %exec
+ %37 = V_ADD_F32_e32 killed %30, killed %1.sub0, implicit %exec
+ undef %56.sub0 = COPY killed %37
+ %56.sub1 = COPY killed %35
+ %56.sub2 = COPY killed %33
+ %56.sub3 = COPY killed %31
+ %28 = S_MOV_B32 0
+ %2 = COPY killed %56
+ %58 = COPY killed %2
+ %59 = COPY killed %28
+
+ bb.2:
+ %4 = COPY killed %59
+ %3 = COPY killed %58
+ %39 = S_LOAD_DWORDX2_IMM killed %0, 6, 0
+ %40 = S_LOAD_DWORD_IMM killed %39, 0, 0
+ %43 = V_MOV_B32_e32 -1102263091, implicit %exec
+ %60 = COPY killed %4
+ %61 = COPY killed %3
+
+ bb.3:
+ successors: %bb.3, %bb.4
+
+ %7 = COPY killed %61
+ %6 = COPY killed %60
+ %8 = S_ADD_I32 killed %6, 1, implicit-def dead %scc
+ %44 = V_ADD_F32_e32 %43, %7.sub3, implicit %exec
+ %46 = V_ADD_F32_e32 %43, %7.sub2, implicit %exec
+ %48 = V_ADD_F32_e32 %43, %7.sub1, implicit %exec
+ %50 = V_ADD_F32_e32 %43, killed %7.sub0, implicit %exec
+ undef %57.sub0 = COPY killed %50
+ %57.sub1 = COPY killed %48
+ %57.sub2 = COPY %46
+ %57.sub3 = COPY killed %44
+ S_CMP_LT_I32 %8, %40, implicit-def %scc
+ %60 = COPY killed %8
+ %61 = COPY killed %57
+ S_CBRANCH_SCC1 %bb.3, implicit killed %scc
+ S_BRANCH %bb.4
+
+ bb.4:
+ EXP 32, undef %53, undef %54, killed %46, undef %55, 0, 0, 15, implicit %exec
+ S_ENDPGM
+
+...
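The four GCN-DAG lines at the top of this file assert the fully coalesced form of the copy chains in bb.0: each chain of full-register and sub-register COPYs collapses into two direct sub-register defs. For the %sgpr2 chain this looks schematically like the following (a sketch; the virtual register numbers are the ones from bb.0 above):

  # Before coalescing:
  %12 = COPY killed %sgpr2
  %17 = S_MOV_B32 1
  undef %18.sub1 = COPY %17
  %0 = COPY %18
  %0.sub0 = COPY killed %12

  # After simple-register-coalescing:
  undef %0.sub0 = COPY %sgpr2
  %0.sub1 = S_MOV_B32 1

Merging %18 into %0 while both sides carry live subranges for .sub0 and .sub1 is the subrange join the comment above refers to; before the PR33524 fix it tripped the unreachable.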
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index ff4069226a62b..260b32ed3406c 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
; SI: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index 266490718dd18..fa29d789cebee 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotl_i64:
; BOTH-DAG: s_lshl_b64
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 9eda479cd25c2..af58b404ca6c6 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotr_i64:
; BOTH-DAG: s_sub_i32
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index 9462683efe0e8..204eeb9983868 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.sqrt.f32(float) nounwind readnone
@@ -48,8 +48,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
-; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
-; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]]
+; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-UNSAFE: buffer_store_dword [[RESULT]]
; SI-SAFE-NOT: v_rsq_f32
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index a131aaa3dfb4f..797fbc2712b0f 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}s_movk_i32_k0:
; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll
index f7a1c65881d02..ee56e9053fd3f 100644
--- a/test/CodeGen/AMDGPU/sad.ll
+++ b/test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index 586a455b2b91e..09e87d524419e 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 6e1dd16383337..d5b2fa0b67540 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 62d0d93678858..0f09fa17423e6 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; XXX - Why the packing?
; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]]
; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 44d46086f02af..2dddba8bccc76 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, it
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index 6b1e85915a110..4ae9871865f5e 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index abd15f1fb47f8..6ed730ad60f42 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -12,10 +12,8 @@
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
-; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
-; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index 7ec6ca809b685..305107f690fb8 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by sdiv is long and complex and may frequently change.
; The goal of this test is to make sure the ISel doesn't fail.
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0dc7cc309f7c9..0d181c2c34b85 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
@@ -35,7 +35,7 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -68,9 +68,9 @@ entry:
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
@@ -168,14 +168,14 @@ entry:
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_f16_sdwa
; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -362,9 +362,9 @@ entry:
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mac_f16_sdwa
; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -491,7 +491,7 @@ entry:
%tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
+
%arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
ret void
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 3417eb02b3614..e0619251f9204 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -103,7 +103,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -122,7 +122,7 @@ define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -154,7 +154,7 @@ define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -171,7 +171,7 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -191,7 +191,7 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -245,7 +245,7 @@ define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -266,8 +266,8 @@ define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]]
define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -291,7 +291,7 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
+; GCN-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[Z]], [[SELECT]]
; GCN: buffer_store_dword [[ADD]]
; GCN: buffer_store_dword [[NEG_X]]
@@ -316,8 +316,8 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c)
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]]
define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -341,7 +341,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -359,7 +359,7 @@ define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -377,7 +377,7 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -397,7 +397,7 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -414,7 +414,7 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -431,7 +431,7 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -445,7 +445,7 @@ define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -462,7 +462,7 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -479,7 +479,7 @@ define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -497,7 +497,7 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -517,7 +517,7 @@ define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -540,7 +540,7 @@ define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -563,7 +563,7 @@ define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -585,7 +585,7 @@ define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -606,7 +606,7 @@ define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -628,7 +628,7 @@ define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index ebbc675b2babe..b77ebcf5bf529 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; Test expansion of scalar selects on vectors.
; Evergreen is not enabled since it seems to have problems with doubles.
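The shape being exercised is a single scalar i1 condition selecting between whole vector values, which the backend must expand into per-element selects. A minimal sketch of that pattern (name and types illustrative):

define amdgpu_kernel void @select_v4i32_sketch(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) {
  %cmp = icmp eq i32 %c, 0
  %sel = select i1 %cmp, <4 x i32> %a, <4 x i32> %b   ; one condition, vector operands
  store <4 x i32> %sel, <4 x i32> addrspace(1)* %out
  ret void
}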
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 92ee2eb7f403f..e79ce3af0cf9d 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -104,8 +104,8 @@ entry:
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -134,8 +134,8 @@ entry:
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -159,16 +159,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
-; VI: v_cmp_lt_f16_e64
; VI: v_cmp_lt_f16_e32
-; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; GCN: s_endpgm
@@ -196,13 +196,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_gt_f32_e64
-; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_gt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
@@ -228,13 +232,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_lt_f32_e64
-; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5
-; VI: v_cmp_gt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_gt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
@@ -263,8 +270,8 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_nlt_f32_e32
-; SI: v_cmp_nlt_f32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; VI: v_cmp_nlt_f16_e32
@@ -298,13 +305,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
+
; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; VI: v_cndmask_b32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index 8d455d84bf9e7..bcaa1aa54c15f 100644
--- a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -7,7 +7,7 @@
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @multi_use_fneg_src() #0 {
@@ -30,7 +30,7 @@ define amdgpu_kernel void @multi_use_fneg_src() #0 {
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
@@ -78,7 +78,7 @@ define amdgpu_kernel void @multi_use_fneg() #0 {
; GCN: buffer_load_dword [[A:v[0-9]+]]
; GCN: buffer_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
; GCN: buffer_store_dword [[MUL1]]
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index f63719d62a847..a3bf167e756af 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -7,8 +7,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; GCN-DAG: v_cmp_eq_u32_e32
-; GCN-DAG: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%result = icmp eq <2 x i32> %a, %b
%sext = sext <2 x i1> %result to <2 x i32>
@@ -23,9 +23,9 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 160fb6a038fed..5b4d9ed259b60 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
; FIXME: r600 fails verifier
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index fb0bbaa9cbf27..8250bad7b0a10 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
; Copy VGPR -> SGPR used twice as an instruction operand, which is then
; used in a REG_SEQUENCE that also needs to be handled.
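At the IR level the duplicate-operand shape reduces to one SSA value feeding both operand slots of a single instruction. A simplified sketch of that shape (illustrative only; the original test additionally arranges for the VGPR-to-SGPR copy, which this stripped-down version does not force):

define amdgpu_kernel void @dup_operand_sketch(float addrspace(1)* %out, float addrspace(1)* %in) {
  %v = load float, float addrspace(1)* %in
  %d = fadd float %v, %v                      ; the same value appears as both operands
  store float %d, float addrspace(1)* %out
  ret void
}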
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 931051102cd5c..3b24cf82d783b 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 4f7b61adc91d5..2f9eed457ab6d 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Extract the high bit of the 1st quarter
; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
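The "high bit of the 1st quarter" is bit 31 of the i128, which lives entirely in the lowest dword, so the 128-bit shift-and-mask should collapse to a single 32-bit extract. A sketch of the pattern under test (names illustrative):

define amdgpu_kernel void @uextract_bit_31_i128_sketch(i128 addrspace(1)* %out, i128 addrspace(1)* %in) {
  %val = load i128, i128 addrspace(1)* %in
  %srl = lshr i128 %val, 31
  %bit = and i128 %srl, 1                     ; isolates bit 31; only dword 0 matters
  store i128 %bit, i128 addrspace(1)* %out
  ret void
}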
@@ -98,7 +98,7 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128
; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[0-9]+}}
; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[ELT1PART]], v[[SHLLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index c70eb9b9c4a53..670287ba79373 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
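Concretely, a bitfield that lies entirely within the upper dword should be selected as v_bfe_u32 on hi_32 of the value rather than as a 64-bit shift sequence. A sketch under that assumption (names illustrative):

define amdgpu_kernel void @bfe_high_half_sketch(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
  %val = load i64, i64 addrspace(1)* %in
  %srl = lshr i64 %val, 33                    ; field starts at bit 33 = bit 1 of the high dword
  %and = and i64 %srl, 7                      ; 3-bit field => v_bfe_u32 hi_32(val), 1, 3
  store i64 %and, i64 addrspace(1)* %out
  ret void
}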
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 5306e190a4f9c..f3faa39c64e68 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
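A worked instance of that rule: for c = 35, only the high dword contributes, so the low dword of the result is hi_32(x) >> 3 and the high dword is the constant 0. As IR (name illustrative):

define amdgpu_kernel void @lshr_i64_35_sketch(i64 addrspace(1)* %out, i64 %x) {
  %shift = lshr i64 %x, 35                    ; 35 > 32: expect reg_sequence (hi_32(x) >> 3), 0
  store i64 %shift, i64 addrspace(1)* %out
  ret void
}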
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index edc313ee323bd..13ac9140b8273 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 6248d8a46daf6..767118eb8d118 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -6,92 +6,7 @@
# that the post-RA run does manage to shrink it, but right now the
# resume crashes
---- |
- define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- declare i32 @llvm.amdgcn.workitem.id.x() #1
-
- attributes #0 = { nounwind }
- attributes #1 = { nounwind readnone }
-
...
----
# GCN-LABEL: name: shrink_add_vop3{{$}}
# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
@@ -151,13 +66,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -166,11 +81,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -235,13 +150,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -250,11 +165,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -319,13 +234,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -334,11 +249,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -402,13 +317,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -417,18 +332,18 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%9 = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_vop3{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit %vcc, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_vop3
@@ -487,13 +402,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -502,19 +417,19 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%vcc = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit undef %vcc, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_undef_vcc
alignment: 0
@@ -572,13 +487,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -587,11 +502,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 348c7200c0bc1..17109187d5387 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 3e452c214e983..c80945f390bed 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 574d1c0b2c78e..0bcef99df39f6 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}sitofp_i16_to_f16
; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index 827d672022eba..41430715f3476 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
@@ -18,7 +18,7 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
; FUNC-LABEL: {{^}}v_abs_i32:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]]
; GCN: v_add_i32
; EG: MAX_INT
@@ -34,7 +34,7 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; GCN-LABEL: {{^}}v_abs_i32_repeat_user:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
@@ -71,8 +71,8 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -132,10 +132,10 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -184,8 +184,8 @@ define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(
; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
+; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
%val0 = load volatile i32, i32 addrspace(1)* %ptr0
%val1 = load volatile i32, i32 addrspace(1)* %ptr1
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index a9aac2d8abb75..27263429650d8 100644
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
; GCN-LABEL: {{^}}s_abs_v2i16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll
index 1ca0919258a8e..cbf9f37e29ef7 100644
--- a/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
; Inline spiller can decide to move a spill as early as possible in the basic block.
; It will skip phis and labels, but we also need to make sure it skips instructions
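For a spill to occur at all under -stress-regalloc=6, the input only needs more simultaneously live values than the stressed register budget. A minimal sketch of that setup (illustrative; volatile keeps the loads live and ordered):

define amdgpu_kernel void @force_spill_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load volatile i32, i32 addrspace(1)* %in
  %b = load volatile i32, i32 addrspace(1)* %in
  %c = load volatile i32, i32 addrspace(1)* %in
  %d = load volatile i32, i32 addrspace(1)* %in
  %e = load volatile i32, i32 addrspace(1)* %in
  %f = load volatile i32, i32 addrspace(1)* %in
  %g = load volatile i32, i32 addrspace(1)* %in
  store volatile i32 %a, i32 addrspace(1)* %out
  store volatile i32 %b, i32 addrspace(1)* %out
  store volatile i32 %c, i32 addrspace(1)* %out
  store volatile i32 %d, i32 addrspace(1)* %out
  store volatile i32 %e, i32 addrspace(1)* %out
  store volatile i32 %f, i32 addrspace(1)* %out
  store volatile i32 %g, i32 addrspace(1)* %out
  ret void
}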
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 44cfdf6398aef..74618b263bad7 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
index e067258920892..51eaf9a960b00 100644
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index cb40ecf2de1ca..8878b45385556 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
index 135632343f909..d65c2adc7e202 100644
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index 1d407ea9bcda6..14bedceed6eee 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}v_test_sub_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -68,7 +68,7 @@ define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -88,7 +88,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index e7655df155204..46f1b120f2127 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
index ee923e2b8b611..8d5c8b64efb83 100644
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -6,7 +6,7 @@
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -165,10 +165,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; VI-NOT: shl
-; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
@@ -201,8 +201,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-; VI-DAG: v_subrev_u16_e32
-; VI-DAG: v_subrev_u16_e32
+; VI: v_sub_u16_e32
+; VI: v_sub_u16_e32
; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
@@ -228,8 +228,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: v_subrev_u16_e32
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
+; VI: v_sub_u16_e32
; VI: buffer_store_dwordx2
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -253,7 +253,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
diff --git a/test/CodeGen/AMDGPU/syncscopes.ll b/test/CodeGen/AMDGPU/syncscopes.ll
new file mode 100644
index 0000000000000..3741ce788993e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/syncscopes.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-before=si-debugger-insert-nops < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: name: syncscopes
+; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+; GCN: FLAT_STORE_DWORD killed %vgpr7_vgpr8, killed %vgpr6, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+define void @syncscopes(
+ i32 %agent,
+ i32 addrspace(4)* %agent_out,
+ i32 %workgroup,
+ i32 addrspace(4)* %workgroup_out,
+ i32 %wavefront,
+ i32 addrspace(4)* %wavefront_out) {
+entry:
+ store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4
+ store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4
+ store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index f90040385f753..77a6820713d6d 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
; CHECK: buffer_load_dword v
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index 0c91d52df0c08..da038f4b05972 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index 632ccaa7e6124..5754bd9bb913a 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
; GCN: s_add_u32
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
-; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG: ADDC_UINT
; EG: ADD_INT
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index d9dab0d40acf6..1d683776bfd5a 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll
index 0c3b0fcaf8549..eaa1d073cafb4 100644
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}uitofp_i16_to_f16
; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index fb4eab43a2d66..823c918dcda70 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by urem is long and complex and may frequently
; change. The goal of this test is to make sure the ISel doesn't fail
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
index d1f454f0bc655..f01bf498e0d8a 100644
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_usubo_i64_zext:
; GCN: s_sub_u32
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_usubo_i32_novcc:
-; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
@@ -120,7 +120,7 @@ define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_usubo_i16:
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
; VI: v_cmp_gt_u16_e32
define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index d4a68a418ee41..5cbfae34e1bb5 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -200,9 +200,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
-; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
-; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
-; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
+; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
+; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
+; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
@@ -292,10 +292,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_dword
; GCN: load_ubyte
-; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v
+; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v
-; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
+; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 2b96f7d50076a..da57155f33ef1 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
@@ -135,7 +135,7 @@ entry:
; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
-; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
+; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index ce4a69db35060..46c9b7ee1a3d5 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
@@ -8,10 +8,10 @@
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
-; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
+; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI: buffer_store_short v[[C_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16(
@@ -147,9 +147,9 @@ entry:
; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
half addrspace(1)* %r,
@@ -171,9 +171,9 @@ entry:
; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
half addrspace(1)* %r,
@@ -312,20 +312,20 @@ entry:
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
-; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; VI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -481,14 +481,14 @@ entry:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
@@ -513,14 +513,14 @@ entry:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll
index 90cf34e609f6e..381ff5b1b518a 100644
--- a/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
new file mode 100644
index 0000000000000..f8a2339626cf1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+
+ define amdgpu_kernel void @fold_fi_vgpr() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_vgpr_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_sgpr_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_fi_sgpr() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_fi_imm() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_imm_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+...
+# GCN-LABEL: name: fold_fi_vgpr{{$}}
+# GCN: %1 = IMPLICIT_DEF
+
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_vgpr
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_vgpr_fi{{$}}
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_vgpr_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_sgpr_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_sgpr_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_fi_sgpr{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_fi_sgpr
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+...
+# TODO: Should probably prefer folding immediate first
+# GCN-LABEL: name: fold_fi_imm{{$}}
+# GCN: %1 = V_MOV_B32_e32 999, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_imm
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = V_MOV_B32_e32 999, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_imm_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 999, %0, implicit-def %vcc, implicit %exec
+name: fold_imm_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = V_MOV_B32_e32 999, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
new file mode 100644
index 0000000000000..b4c0c93347c20
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
@@ -0,0 +1,40 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+...
+# GCN-LABEL: name: fold_imm_non_ssa{{$}}
+# GCN: %0 = V_MOV_B32_e32 123, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 456, %0, implicit-def %vcc, implicit %exec
+
+name: fold_imm_non_ssa
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+body: |
+ bb.0:
+ %0 = COPY undef %0
+ %0 = V_MOV_B32_e32 123, implicit %exec
+ %1 = V_MOV_B32_e32 456, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_partially_defined_superreg{{$}}
+# GCN: %1 = V_MOV_B32_e32 456, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 123, %1, implicit-def %vcc, implicit %exec
+name: fold_partially_defined_superreg
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vreg_64 }
+body: |
+ bb.0:
+ undef %3.sub0 = V_MOV_B32_e32 123, implicit %exec, implicit-def %3
+ %1 = V_MOV_B32_e32 456, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %3.sub0, %1, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index bb6234729f90b..02ffd30be5fda 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -7,7 +7,9 @@
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-; SI: v_cndmask_b32_e64
+; SI: v_cmp_gt_i32_e32 vcc
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_i32_e32 vcc
; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
@@ -25,8 +27,11 @@ entry:
; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e32
+
+; SI: v_cmp_neq_f32_e32 vcc
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_neq_f32_e32 vcc
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
entry:
@@ -45,12 +50,10 @@ entry:
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-; FIXME: The shrinking does not happen on tonga
-
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:
@@ -68,6 +71,10 @@ entry:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
entry:
%0 = load <4 x float>, <4 x float> addrspace(1)* %in0
diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir
index 44dbd38f2d300..5612c7cac00b7 100644
--- a/test/CodeGen/AMDGPU/waitcnt-permute.mir
+++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir
@@ -1,18 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
---- |
- define float @waitcnt-permute(i32 %x, i32 %y) {
- entry:
- %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y)
- %1 = bitcast i32 %0 to float
- %2 = fadd float 1.000000e+00, %1
- ret float %2
- }
-
- declare i32 @llvm.amdgcn.ds.bpermute(i32, i32)
-
...
----
# CHECK-LABEL: name: waitcnt-permute{{$}}
# CHECK: DS_BPERMUTE_B32
# CHECK-NEXT: S_WAITCNT 127
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 57a082a0170c3..847a1d7393215 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}xor_v2i32:
@@ -60,7 +60,7 @@ define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)*
; FUNC-LABEL: {{^}}v_xor_i1:
; SI: buffer_load_ubyte [[B:v[0-9]+]]
; SI: buffer_load_ubyte [[A:v[0-9]+]]
-; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
+; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index a902234898cd0..69c42afb9ad5a 100644
--- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -6,7 +6,7 @@
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
@@ -26,7 +26,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
; GCN-NOT: v[[HI]]
; GCN-NOT: _or_
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
index 9dcfe5007c006..ed5255bfbebd4 100644
--- a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
+++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
@@ -6,23 +6,23 @@
; CHECK: ** List Scheduling
; CHECK: SU(2){{.*}}STR{{.*}}Volatile
-; CHECK-NOT: ord SU
-; CHECK: ord SU(3): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(3): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: SU(3){{.*}}LDR{{.*}}Volatile
-; CHECK-NOT: ord SU
-; CHECK: ord SU(2): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(2): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: Successors:
; CHECK: ** List Scheduling
; CHECK: SU(2){{.*}}STR{{.*}}
-; CHECK-NOT: ord SU
-; CHECK: ord SU(3): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(3): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: SU(3){{.*}}LDR{{.*}}
-; CHECK-NOT: ord SU
-; CHECK: ord SU(2): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(2): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: Successors:
define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind {
entry:
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
index 111375ece51ba..6c8bc7123a1ab 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
@@ -10,6 +10,46 @@
define void @test_icmp_sge_s32() { ret void }
define void @test_icmp_slt_s32() { ret void }
define void @test_icmp_sle_s32() { ret void }
+
+ define void @test_fcmp_true_s32() #0 { ret void }
+ define void @test_fcmp_false_s32() #0 { ret void }
+
+ define void @test_fcmp_oeq_s32() #0 { ret void }
+ define void @test_fcmp_ogt_s32() #0 { ret void }
+ define void @test_fcmp_oge_s32() #0 { ret void }
+ define void @test_fcmp_olt_s32() #0 { ret void }
+ define void @test_fcmp_ole_s32() #0 { ret void }
+ define void @test_fcmp_ord_s32() #0 { ret void }
+ define void @test_fcmp_ugt_s32() #0 { ret void }
+ define void @test_fcmp_uge_s32() #0 { ret void }
+ define void @test_fcmp_ult_s32() #0 { ret void }
+ define void @test_fcmp_ule_s32() #0 { ret void }
+ define void @test_fcmp_une_s32() #0 { ret void }
+ define void @test_fcmp_uno_s32() #0 { ret void }
+
+ define void @test_fcmp_one_s32() #0 { ret void }
+ define void @test_fcmp_ueq_s32() #0 { ret void }
+
+ define void @test_fcmp_true_s64() #0 { ret void }
+ define void @test_fcmp_false_s64() #0 { ret void }
+
+ define void @test_fcmp_oeq_s64() #0 { ret void }
+ define void @test_fcmp_ogt_s64() #0 { ret void }
+ define void @test_fcmp_oge_s64() #0 { ret void }
+ define void @test_fcmp_olt_s64() #0 { ret void }
+ define void @test_fcmp_ole_s64() #0 { ret void }
+ define void @test_fcmp_ord_s64() #0 { ret void }
+ define void @test_fcmp_ugt_s64() #0 { ret void }
+ define void @test_fcmp_uge_s64() #0 { ret void }
+ define void @test_fcmp_ult_s64() #0 { ret void }
+ define void @test_fcmp_ule_s64() #0 { ret void }
+ define void @test_fcmp_une_s64() #0 { ret void }
+ define void @test_fcmp_uno_s64() #0 { ret void }
+
+ define void @test_fcmp_one_s64() #0 { ret void }
+ define void @test_fcmp_ueq_s64() #0 { ret void }
+
+ attributes #0 = { "target-features"="+vfp2" }
...
---
name: test_icmp_eq_s32
@@ -35,8 +75,8 @@ body: |
%2(s1) = G_ICMP intpred(eq), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -71,8 +111,8 @@ body: |
%2(s1) = G_ICMP intpred(ne), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -107,8 +147,8 @@ body: |
%2(s1) = G_ICMP intpred(ugt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -143,8 +183,8 @@ body: |
%2(s1) = G_ICMP intpred(uge), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -179,8 +219,8 @@ body: |
%2(s1) = G_ICMP intpred(ult), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -215,8 +255,8 @@ body: |
%2(s1) = G_ICMP intpred(ule), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -251,8 +291,8 @@ body: |
%2(s1) = G_ICMP intpred(sgt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -287,8 +327,8 @@ body: |
%2(s1) = G_ICMP intpred(sge), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -323,8 +363,8 @@ body: |
%2(s1) = G_ICMP intpred(slt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -359,8 +399,1180 @@ body: |
%2(s1) = G_ICMP intpred(sle), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s32
+# CHECK-LABEL: name: test_fcmp_true_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(true), %0(s32), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s32
+# CHECK-LABEL: name: test_fcmp_false_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(false), %0(s32), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s32
+# CHECK-LABEL: name: test_fcmp_oeq_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s32
+# CHECK-LABEL: name: test_fcmp_ogt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s32
+# CHECK-LABEL: name: test_fcmp_oge_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(oge), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s32
+# CHECK-LABEL: name: test_fcmp_olt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(olt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s32
+# CHECK-LABEL: name: test_fcmp_ole_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ole), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s32
+# CHECK-LABEL: name: test_fcmp_ord_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ord), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s32
+# CHECK-LABEL: name: test_fcmp_ugt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s32
+# CHECK-LABEL: name: test_fcmp_uge_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(uge), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s32
+# CHECK-LABEL: name: test_fcmp_ult_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ult), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s32
+# CHECK-LABEL: name: test_fcmp_ule_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ule), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s32
+# CHECK-LABEL: name: test_fcmp_une_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(une), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s32
+# CHECK-LABEL: name: test_fcmp_uno_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(uno), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr
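+ ; "one" has no single ARM condition code, so it is selected as two compares;
+ ; GT (12) catches ogt, MI (4) catches olt, and the second MOVCCi chains on
+ ; the first result.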
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s32
+# CHECK-LABEL: name: test_fcmp_ueq_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s64
+# CHECK-LABEL: name: test_fcmp_true_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(true), %0(s64), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _
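+ ; Constant predicates need no compare at all: "true" selects straight to
+ ; MOVi 1 (and "false" below to MOVi 0).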
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s64
+# CHECK-LABEL: name: test_fcmp_false_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(false), %0(s64), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s64
+# CHECK-LABEL: name: test_fcmp_oeq_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(oeq), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
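+ ; The s64 variants take their operands in d registers and select VCMPD, but
+ ; are otherwise identical to the s32 tests above.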
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s64
+# CHECK-LABEL: name: test_fcmp_ogt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ogt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s64
+# CHECK-LABEL: name: test_fcmp_oge_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(oge), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s64
+# CHECK-LABEL: name: test_fcmp_olt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(olt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s64
+# CHECK-LABEL: name: test_fcmp_ole_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ole), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s64
+# CHECK-LABEL: name: test_fcmp_ord_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ord), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s64
+# CHECK-LABEL: name: test_fcmp_uge_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(uge), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s64
+# CHECK-LABEL: name: test_fcmp_ult_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ult), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s64
+# CHECK-LABEL: name: test_fcmp_ule_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ule), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s64
+# CHECK-LABEL: name: test_fcmp_une_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(une), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s64
+# CHECK-LABEL: name: test_fcmp_uno_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(uno), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s64
+# CHECK-LABEL: name: test_fcmp_one_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(one), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s64
+# CHECK-LABEL: name: test_fcmp_ueq_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ueq), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
index 7d021fdb43dd9..98b39e444ac77 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
@@ -49,3 +49,33 @@ define arm_aapcscc double @test_add_double(double %x, double %y) {
%r = fadd double %x, %y
ret double %r
}
+
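+; The RUN lines above enable the HARD, SOFT-AEABI and SOFT-DEFAULT check
+; prefixes, one per float ABI configuration under test.
+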
+define arm_aapcs_vfpcc i32 @test_cmp_float_ogt(float %x, float %y) {
+; CHECK-LABEL: test_cmp_float_ogt
+; HARD: vcmp.f32
+; HARD: vmrs APSR_nzcv, fpscr
+; HARD-NEXT: movgt
+; SOFT-AEABI: blx __aeabi_fcmpgt
+; SOFT-DEFAULT: blx __gtsf2
+entry:
+ %v = fcmp ogt float %x, %y
+ %r = zext i1 %v to i32
+ ret i32 %r
+}
+
+define arm_aapcs_vfpcc i32 @test_cmp_float_one(float %x, float %y) {
+; CHECK-LABEL: test_cmp_float_one
+; HARD: vcmp.f32
+; HARD: vmrs APSR_nzcv, fpscr
+; HARD: movgt
+; HARD-NOT: vcmp
+; HARD: movmi
+; SOFT-AEABI-DAG: blx __aeabi_fcmpgt
+; SOFT-AEABI-DAG: blx __aeabi_fcmplt
+; SOFT-DEFAULT-DAG: blx __gtsf2
+; SOFT-DEFAULT-DAG: blx __ltsf2
+entry:
+ %v = fcmp one float %x, %y
+ %r = zext i1 %v to i32
+ ret i32 %r
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
index c93e7fa0ec560..9a0877846fc3e 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
@@ -36,6 +36,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -44,6 +45,7 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
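+ ; The SOFT-NOT lines before and after the call sequence prove the G_SDIV
+ ; really was replaced by the libcall, not just surrounded by it.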
%2(s32) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -70,6 +72,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -78,6 +81,7 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
%2(s32) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -106,6 +110,7 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -114,7 +119,9 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_SDIV
%2(s16) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s16)
@@ -143,6 +150,7 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -151,7 +159,9 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_UDIV
%2(s16) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s16)
@@ -180,6 +190,7 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -188,7 +199,9 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_SDIV
%2(s8) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s8)
@@ -217,6 +230,7 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -225,7 +239,9 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_UDIV
%2(s8) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s8)
@@ -254,6 +270,7 @@ body: |
; HWDIV: [[Q:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT-NOT: G_SREM
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -262,6 +279,7 @@ body: |
; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SREM
%2(s32) = G_SREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -290,6 +308,7 @@ body: |
; HWDIV: [[Q:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT-NOT: G_UREM
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -298,6 +317,7 @@ body: |
; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UREM
%2(s32) = G_UREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
index 803135ba595e4..cb61f95b10ce9 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
@@ -10,6 +10,44 @@
define void @test_fadd_float() { ret void }
define void @test_fadd_double() { ret void }
+
+ define void @test_fcmp_true_s32() { ret void }
+ define void @test_fcmp_false_s32() { ret void }
+
+ define void @test_fcmp_oeq_s32() { ret void }
+ define void @test_fcmp_ogt_s32() { ret void }
+ define void @test_fcmp_oge_s32() { ret void }
+ define void @test_fcmp_olt_s32() { ret void }
+ define void @test_fcmp_ole_s32() { ret void }
+ define void @test_fcmp_ord_s32() { ret void }
+ define void @test_fcmp_ugt_s32() { ret void }
+ define void @test_fcmp_uge_s32() { ret void }
+ define void @test_fcmp_ult_s32() { ret void }
+ define void @test_fcmp_ule_s32() { ret void }
+ define void @test_fcmp_une_s32() { ret void }
+ define void @test_fcmp_uno_s32() { ret void }
+
+ define void @test_fcmp_one_s32() { ret void }
+ define void @test_fcmp_ueq_s32() { ret void }
+
+ define void @test_fcmp_true_s64() { ret void }
+ define void @test_fcmp_false_s64() { ret void }
+
+ define void @test_fcmp_oeq_s64() { ret void }
+ define void @test_fcmp_ogt_s64() { ret void }
+ define void @test_fcmp_oge_s64() { ret void }
+ define void @test_fcmp_olt_s64() { ret void }
+ define void @test_fcmp_ole_s64() { ret void }
+ define void @test_fcmp_ord_s64() { ret void }
+ define void @test_fcmp_ugt_s64() { ret void }
+ define void @test_fcmp_uge_s64() { ret void }
+ define void @test_fcmp_ult_s64() { ret void }
+ define void @test_fcmp_ule_s64() { ret void }
+ define void @test_fcmp_une_s64() { ret void }
+ define void @test_fcmp_uno_s64() { ret void }
+
+ define void @test_fcmp_one_s64() { ret void }
+ define void @test_fcmp_ueq_s64() { ret void }
...
---
name: test_frem_float
@@ -31,6 +69,7 @@ body: |
; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
%0(s32) = COPY %r0
%1(s32) = COPY %r1
+ ; CHECK-NOT: G_FREM
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -41,6 +80,7 @@ body: |
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; HARD: [[R:%[0-9]+]](s32) = COPY %s0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FREM
%2(s32) = G_FREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -86,6 +126,7 @@ body: |
; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]]
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ ; CHECK-NOT: G_FREM
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -96,6 +137,7 @@ body: |
; SOFT: BLX $fmod, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; HARD: BLX $fmod, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FREM
%6(s64) = G_FREM %4, %5
%7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64)
%r0 = COPY %7(s32)
@@ -122,6 +164,7 @@ body: |
; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
%0(s32) = COPY %r0
%1(s32) = COPY %r1
+ ; CHECK-NOT: G_FPOW
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -132,6 +175,7 @@ body: |
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; HARD: [[R:%[0-9]+]](s32) = COPY %s0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FPOW
%2(s32) = G_FPOW %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -177,6 +221,7 @@ body: |
; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]]
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ ; CHECK-NOT: G_FPOW
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -187,6 +232,7 @@ body: |
; SOFT: BLX $pow, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; HARD: BLX $pow, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FPOW
%6(s64) = G_FPOW %4, %5
%7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64)
%r0 = COPY %7(s32)
@@ -214,6 +260,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HARD: [[R:%[0-9]+]](s32) = G_FADD [[X]], [[Y]]
+ ; SOFT-NOT: G_FADD
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -221,6 +268,7 @@ body: |
; SOFT-DEFAULT: BLX $__addsf3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_FADD
%2(s32) = G_FADD %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -261,6 +309,7 @@ body: |
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
; HARD: [[R:%[0-9]+]](s64) = G_FADD [[X]], [[Y]]
+ ; SOFT-NOT: G_FADD
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -269,6 +318,7 @@ body: |
; SOFT-AEABI: BLX $__aeabi_dadd, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; SOFT-DEFAULT: BLX $__adddf3, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_FADD
%6(s64) = G_FADD %4, %5
; HARD-DAG: G_UNMERGE_VALUES [[R]](s64)
%7(s32),%8(s32) = G_UNMERGE_VALUES %6(s64)
@@ -276,3 +326,1565 @@ body: |
%r1 = COPY %8(s32)
BX_RET 14, _, implicit %r0, implicit %r1
...
+---
+name: test_fcmp_true_s32
+# CHECK-LABEL: name: test_fcmp_true_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(true), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s32
+# CHECK-LABEL: name: test_fcmp_false_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(false), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s32
+# CHECK-LABEL: name: test_fcmp_oeq_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
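+ ; The AEABI helpers return a ready-made boolean that only needs a G_TRUNC,
+ ; whereas the default __eqsf2-style helpers return a three-way result that
+ ; must be tested against zero.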
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s32
+# CHECK-LABEL: name: test_fcmp_ogt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s32
+# CHECK-LABEL: name: test_fcmp_oge_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(oge), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s32
+# CHECK-LABEL: name: test_fcmp_olt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(olt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s32
+# CHECK-LABEL: name: test_fcmp_ole_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ole), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s32
+# CHECK-LABEL: name: test_fcmp_ord_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ord), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s32
+# CHECK-LABEL: name: test_fcmp_ugt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
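+ ; Unordered predicates are legalized via the complementary ordered helper
+ ; (fcmple / __lesf2 here), with the integer test on the result adjusted to
+ ; match.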
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s32
+# CHECK-LABEL: name: test_fcmp_uge_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(uge), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s32
+# CHECK-LABEL: name: test_fcmp_ult_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ult), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s32
+# CHECK-LABEL: name: test_fcmp_ule_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ule), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s32
+# CHECK-LABEL: name: test_fcmp_une_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(une), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__nesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s32
+# CHECK-LABEL: name: test_fcmp_uno_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(uno), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]]
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
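+ ; Each of the two libcall results is an s1, so they are anyext'd to s32,
+ ; OR'd together, and truncated back to s1 before the final zext.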
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s32
+# CHECK-LABEL: name: test_fcmp_ueq_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]]
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s64
+# CHECK-LABEL: name: test_fcmp_true_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(true), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
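+ ; "true" folds to the all-ones constant -1 truncated to s1, so no libcall
+ ; is needed even for s64.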
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s64
+# CHECK-LABEL: name: test_fcmp_false_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(false), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s64
+# CHECK-LABEL: name: test_fcmp_oeq_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(oeq), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s64
+# CHECK-LABEL: name: test_fcmp_ogt_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ogt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s64
+# CHECK-LABEL: name: test_fcmp_oge_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(oge), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s64
+# CHECK-LABEL: name: test_fcmp_olt_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(olt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s64
+# CHECK-LABEL: name: test_fcmp_ole_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ole), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s64
+# CHECK-LABEL: name: test_fcmp_ord_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ord), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
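+# Note (editorial): there is no direct libcall for the unordered predicates;
+# the legalizer is expected to call the inverse ordered comparison (ole here,
+# and similarly for the uge/ult/ule tests that follow) and test the returned
+# value against zero, as the SOFT lines below check.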
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ugt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s64
+# CHECK-LABEL: name: test_fcmp_uge_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(uge), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s64
+# CHECK-LABEL: name: test_fcmp_ult_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ult), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s64
+# CHECK-LABEL: name: test_fcmp_ule_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ule), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s64
+# CHECK-LABEL: name: test_fcmp_une_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(une), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__nedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s64
+# CHECK-LABEL: name: test_fcmp_uno_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(uno), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s64
+# CHECK-LABEL: name: test_fcmp_one_s64
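+# Note (editorial): "one" has no single soft-float routine, so the expansion
+# below performs separate greater-than and less-than libcalls and ORs the
+# truncated results together.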
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(one), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s64
+# CHECK-LABEL: name: test_fcmp_ueq_s64
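+# Note (editorial): "ueq" likewise expands into two libcalls, an equality
+# check ORed with an unordered check, as the SOFT lines below verify.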
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ueq), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index bf759728c3658..4575341dfc290 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -111,6 +111,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_ADD %0, %1
; G_ADD with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -136,6 +137,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_ADD %0, %1
; G_ADD with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -187,6 +189,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_SUB %0, %1
; G_SUB with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -212,6 +215,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_SUB %0, %1
; G_SUB with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -263,6 +267,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_MUL %0, %1
; G_MUL with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -288,6 +293,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_MUL %0, %1
; G_MUL with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -339,6 +345,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_AND %0, %1
; G_AND with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -364,6 +371,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_AND %0, %1
; G_AND with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -415,6 +423,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_OR %0, %1
; G_OR with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -440,6 +449,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_OR %0, %1
; G_OR with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -491,6 +501,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_XOR %0, %1
; G_XOR with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -516,6 +527,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_XOR %0, %1
; G_XOR with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -689,11 +701,32 @@ selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
body: |
bb.0:
%0(s32) = G_CONSTANT 42
; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 42
+ %1(s16) = G_CONSTANT i16 21
+ ; CHECK-NOT: G_CONSTANT i16
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 21
+ ; CHECK: {{%[0-9]+}}(s16) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i16
+
+ %2(s8) = G_CONSTANT i8 10
+ ; CHECK-NOT: G_CONSTANT i8
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 10
+ ; CHECK: {{%[0-9]+}}(s8) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i8
+
+ %3(s1) = G_CONSTANT i1 1
+ ; CHECK-NOT: G_CONSTANT i1
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; CHECK: {{%[0-9]+}}(s1) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i1
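+ ; Note (editorial): widening sign-extends the constant value, which is why
+ ; the i1 value 1 reappears as i32 -1 while the positive i16/i8 constants
+ ; keep their values.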
+
%r0 = COPY %0(s32)
BX_RET 14, _, implicit %r0
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index d3b93e488ef47..ffca431d96ea1 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -35,6 +35,8 @@
define void @test_trunc_s32_16() { ret void }
define void @test_icmp_eq_s32() { ret void }
+ define void @test_fcmp_one_s32() #0 { ret void }
+ define void @test_fcmp_ugt_s64() #0 { ret void }
define void @test_select_s32() { ret void }
@@ -743,6 +745,62 @@ body: |
...
---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: fprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
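+# Note (editorial): the floating-point operands are assigned to the FP bank
+# while the boolean result of the comparison lands on the GPR bank, both here
+# and in test_fcmp_ugt_s64 below.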
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %r0 = COPY %3(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: fprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+ %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %r0 = COPY %3(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
name: test_select_s32
# CHECK-LABEL: name: test_select_s32
legalized: true
diff --git a/test/CodeGen/ARM/arguments-nosplit-double.ll b/test/CodeGen/ARM/arguments-nosplit-double.ll
index 8e4dee45ddf27..bb3710842d34c 100644
--- a/test/CodeGen/ARM/arguments-nosplit-double.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-double.ll
@@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, double %b) {
ret i32 %tmp
}
+; CHECK-LABEL: f:
; CHECK-NOT: r3
diff --git a/test/CodeGen/ARM/arguments-nosplit-i64.ll b/test/CodeGen/ARM/arguments-nosplit-i64.ll
index 4a08d0a0406ac..02bdc6cc227a0 100644
--- a/test/CodeGen/ARM/arguments-nosplit-i64.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-i64.ll
@@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, i64 %b) {
ret i32 %tmp
}
+; CHECK-LABEL: f:
; CHECK-NOT: r3
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
index d54848a6bcf19..0ae2d5f6f2f2b 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
@@ -13,13 +13,13 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=4
define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
%1 = load i32, i32* @a, align 4
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
index 9cb076651f5b3..bc7a14b1028ef 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
@@ -8,9 +8,9 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 3
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
define i32 @foo(i32* %a) nounwind optsize {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
index 774b0a907e399..67cddc14d0475 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
@@ -10,7 +10,7 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2
; CHECK: Successors
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index e234e179ed071..372b2e2f5dc99 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -11,7 +11,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMULS read-advanced latency to VMLAS = 0
; CHECK-SAME: Latency=0
@@ -20,7 +20,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMLAS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS read-advanced latency to the next VMLAS = 4
; CHECK-SAME: Latency=4
@@ -28,7 +28,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; CHECK-FAST: VFMAS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -50,7 +50,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; VMULfd read-advanced latency to VMLAfd = 0
; CHECK-SAME: Latency=0
@@ -59,7 +59,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMLAfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAfd read-advanced latency to the next VMLAfd = 4
; CHECK-SAME: Latency=4
@@ -67,7 +67,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; CHECK-FAST: VFMAfd
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9
@@ -88,7 +88,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMULS read-advanced latency to VMLSS = 0
; CHECK-SAME: Latency=0
@@ -97,7 +97,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMLSS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSS read-advanced latency to the next VMLSS = 4
; CHECK-SAME: Latency=4
@@ -105,7 +105,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; CHECK-FAST: VFMSS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -127,7 +127,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; VMULfd read-advanced latency to VMLSfd = 0
; CHECK-SAME: Latency=0
@@ -136,7 +136,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMLSfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSfd read-advanced latency to the next VMLSfd = 4
; CHECK-SAME: Latency=4
@@ -144,7 +144,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; CHECK-FAST: VFMSfd
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9
@@ -165,7 +165,7 @@ define float @Test5(float %f1, float %f2, float %f3) {
; CHECK-FAST: VFNMS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -184,7 +184,7 @@ define float @Test6(float %f1, float %f2, float %f3) {
; CHECK-FAST: VFNMA
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
index 6cfa823fb9694..b5edcc3042293 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@@ -13,15 +13,15 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=6
define i32 @bar(i32* %iptr) minsize optsize {
%1 = load double, double* @a, align 8
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
index 218b5b41a7e43..12c7b3270c3b3 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
@@ -8,11 +8,11 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=6
define double @foo(double* %a) nounwind optsize {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
index af1c469d44432..05c498eee49f7 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
@@ -9,7 +9,7 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
@a = global double 0.0, align 4
diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll
index ec032ccac423c..536b6cc7c9d01 100644
--- a/test/CodeGen/ARM/fence-singlethread.ll
+++ b/test/CodeGen/ARM/fence-singlethread.ll
@@ -11,6 +11,6 @@ define void @fence_singlethread() {
; CHECK: @ COMPILER BARRIER
; CHECK-NOT: dmb
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
diff --git a/test/CodeGen/ARM/ror.ll b/test/CodeGen/ARM/ror.ll
new file mode 100644
index 0000000000000..0f699a8dd29d6
--- /dev/null
+++ b/test/CodeGen/ARM/ror.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
+
+; rotr (rotr x, 4), 6 -> rotr x, 10 -> ror r0, r0, #10
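+; For example (illustrative values, not checked by FileCheck): with
+; x = 0x000FFFFF, rotr(x, 4) = 0xF000FFFF and rotr(0xF000FFFF, 6) = 0xFFC003FF,
+; the same result as the single rotr(x, 10).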
+define i32 @test1(i32 %x) nounwind readnone {
+; CHECK-LABEL: test1:
+; CHECK: ror r0, r0, #10
+; CHECK: bx lr
+entry:
+ %high_part.i = shl i32 %x, 28
+ %low_part.i = lshr i32 %x, 4
+ %result.i = or i32 %high_part.i, %low_part.i
+ %high_part.i.1 = shl i32 %result.i, 26
+ %low_part.i.2 = lshr i32 %result.i, 6
+ %result.i.3 = or i32 %low_part.i.2, %high_part.i.1
+ ret i32 %result.i.3
+}
+
+; The same rotate combine, applied to a vector type.
+define <2 x i32> @test2(<2 x i32> %x) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK: ror r0, r0, #10
+; CHECK: ror r1, r1, #10
+; CHECK: bx lr
+entry:
+ %high_part.i = shl <2 x i32> %x, <i32 28, i32 28>
+ %low_part.i = lshr <2 x i32> %x, <i32 4, i32 4>
+ %result.i = or <2 x i32> %high_part.i, %low_part.i
+ %high_part.i.1 = shl <2 x i32> %result.i, <i32 26, i32 26>
+ %low_part.i.2 = lshr <2 x i32> %result.i, <i32 6, i32 6>
+ %result.i.3 = or <2 x i32> %low_part.i.2, %high_part.i.1
+ ret <2 x i32> %result.i.3
+}
+
diff --git a/test/CodeGen/ARM/scavenging.mir b/test/CodeGen/ARM/scavenging.mir
new file mode 100644
index 0000000000000..09040a3bd217e
--- /dev/null
+++ b/test/CodeGen/ARM/scavenging.mir
@@ -0,0 +1,66 @@
+# RUN: llc -o - %s -mtriple=arm-arm-none-eabi -mcpu=cortex-m0 -run-pass scavenger-test | FileCheck %s
+---
+# CHECK-LABEL: name: scavengebug0
+# Make sure we are not spilling/using a physreg used in the very last
+# instruction of the scavenging range.
+# CHECK-NOT: tSTRi {{.*}}%r0,{{.*}}%r0
+# CHECK-NOT: tSTRi {{.*}}%r1,{{.*}}%r1
+# CHECK-NOT: tSTRi {{.*}}%r2,{{.*}}%r2
+# CHECK-NOT: tSTRi {{.*}}%r3,{{.*}}%r3
+# CHECK-NOT: tSTRi {{.*}}%r4,{{.*}}%r4
+# CHECK-NOT: tSTRi {{.*}}%r5,{{.*}}%r5
+# CHECK-NOT: tSTRi {{.*}}%r6,{{.*}}%r6
+# CHECK-NOT: tSTRi {{.*}}%r7,{{.*}}%r7
+name: scavengebug0
+body: |
+ bb.0:
+ ; Bring up register pressure to force emergency spilling
+ %r0 = IMPLICIT_DEF
+ %r1 = IMPLICIT_DEF
+ %r2 = IMPLICIT_DEF
+ %r3 = IMPLICIT_DEF
+ %r4 = IMPLICIT_DEF
+ %r5 = IMPLICIT_DEF
+ %r6 = IMPLICIT_DEF
+ %r7 = IMPLICIT_DEF
+
+ %0 : tgpr = IMPLICIT_DEF
+ %0 = tADDhirr %0, %sp, 14, _
+ tSTRi %r0, %0, 0, 14, _
+
+ %1 : tgpr = IMPLICIT_DEF
+ %1 = tADDhirr %1, %sp, 14, _
+ tSTRi %r1, %1, 0, 14, _
+
+ %2 : tgpr = IMPLICIT_DEF
+ %2 = tADDhirr %2, %sp, 14, _
+ tSTRi %r2, %2, 0, 14, _
+
+ %3 : tgpr = IMPLICIT_DEF
+ %3 = tADDhirr %3, %sp, 14, _
+ tSTRi %r3, %3, 0, 14, _
+
+ %4 : tgpr = IMPLICIT_DEF
+ %4 = tADDhirr %4, %sp, 14, _
+ tSTRi %r4, %4, 0, 14, _
+
+ %5 : tgpr = IMPLICIT_DEF
+ %5 = tADDhirr %5, %sp, 14, _
+ tSTRi %r5, %5, 0, 14, _
+
+ %6 : tgpr = IMPLICIT_DEF
+ %6 = tADDhirr %6, %sp, 14, _
+ tSTRi %r6, %6, 0, 14, _
+
+ %7 : tgpr = IMPLICIT_DEF
+ %7 = tADDhirr %7, %sp, 14, _
+ tSTRi %r7, %7, 0, 14, _
+
+ KILL %r0
+ KILL %r1
+ KILL %r2
+ KILL %r3
+ KILL %r4
+ KILL %r5
+ KILL %r6
+ KILL %r7
diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll
new file mode 100644
index 0000000000000..d6f07f6535763
--- /dev/null
+++ b/test/CodeGen/AVR/branch-relaxation.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
+; LBB0_1:
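+; (Editorial note: AVR conditional branches encode a 7-bit signed word
+; offset, roughly +/-128 bytes, so the ~50 two-byte NOPs below push the
+; target out of range and the branch has to be inverted around an rjmp.)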
+
+define i8 @relax_breq(i1 %a) {
+entry-block:
+ br i1 %a, label %hello, label %finished
+
+hello:
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ br label %finished
+finished:
+ ret i8 3
+}
+
+; CHECK-LABEL: no_relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
+; CHECK: nop
+; ...
+; LBB0_1:
+define i8 @no_relax_breq(i1 %a) {
+entry-block:
+ br i1 %a, label %hello, label %finished
+
+hello:
+ ; There are not enough NOPs to require relaxation.
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ br label %finished
+finished:
+ ret i8 3
+}
+
diff --git a/test/CodeGen/AVR/ctlz.ll b/test/CodeGen/AVR/ctlz.ll
index 4f73e846b1f19..8659550baf908 100644
--- a/test/CodeGen/AVR/ctlz.ll
+++ b/test/CodeGen/AVR/ctlz.ll
@@ -10,7 +10,8 @@ declare i8 @llvm.ctlz.i8(i8)
; CHECK-LABEL: count_leading_zeros:
; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
; CHECK: lsr {{.*}}[[SCRATCH]]
; CHECK: or {{.*}}[[SCRATCH]], {{.*}}[[RESULT]]
@@ -43,6 +44,6 @@ declare i8 @llvm.ctlz.i8(i8)
; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: andi {{.*}}[[RESULT]], 15
; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: LBB0_2:
; CHECK: ldi {{.*}}[[RESULT]], 8
; CHECK: ret
diff --git a/test/CodeGen/AVR/cttz.ll b/test/CodeGen/AVR/cttz.ll
index 2501566275ea0..02d36954f5264 100644
--- a/test/CodeGen/AVR/cttz.ll
+++ b/test/CodeGen/AVR/cttz.ll
@@ -10,7 +10,7 @@ declare i8 @llvm.cttz.i8(i8)
; CHECK-LABEL: count_trailing_zeros:
; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
; CHECK: dec {{.*}}[[SCRATCH]]
; CHECK: com {{.*}}[[RESULT]]
@@ -34,7 +34,7 @@ declare i8 @llvm.cttz.i8(i8)
; CHECK: andi {{.*}}[[SCRATCH]], 15
; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: [[END_BB]]:
; CHECK: ldi {{.*}}[[SCRATCH]], 8
; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: ret
diff --git a/test/CodeGen/AVR/frmidx-iterator-bug.ll b/test/CodeGen/AVR/frmidx-iterator-bug.ll
new file mode 100644
index 0000000000000..f9e2f0688fafb
--- /dev/null
+++ b/test/CodeGen/AVR/frmidx-iterator-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=avr -mattr=avr6 | FileCheck %s
+
+%str_slice = type { i8*, i16 }
+%Machine = type { i16, [0 x i8], i16, [0 x i8], [16 x i8], [0 x i8] }
+
+; CHECK-LABEL: step
+define void @step(%Machine*) {
+ ret void
+}
+
+; CHECK-LABEL: main
+define void @main() {
+start:
+ %machine = alloca %Machine, align 8
+ %v0 = bitcast %Machine* %machine to i8*
+ %v1 = getelementptr inbounds %Machine, %Machine* %machine, i16 0, i32 2
+ %v2 = load i16, i16* %v1, align 2
+ br label %bb2.i5
+
+bb2.i5:
+ %v18 = load volatile i8, i8* inttoptr (i16 77 to i8*), align 1
+ %v19 = icmp sgt i8 %v18, -1
+ br i1 %v19, label %bb2.i5, label %bb.exit6
+
+bb.exit6:
+ %v20 = load volatile i8, i8* inttoptr (i16 78 to i8*), align 2
+ br label %bb7
+
+bb7:
+ call void @step(%Machine* %machine)
+ br label %bb7
+}
+
diff --git a/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll
new file mode 100644
index 0000000000000..17ac29e2cdb86
--- /dev/null
+++ b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mattr=lpm,lpmw < %s -march=avr | FileCheck %s
+
+declare void @callback(i16 zeroext)
+
+; CHECK-LABEL: foo
+define void @foo() {
+entry:
+ ; CHECK: ldi r{{[0-9]+}}, pm_lo8(callback)
+ ; CHECK-NEXT: ldi r{{[0-9]+}}, pm_hi8(callback)
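+ ; (Editorial note: the pm_lo8/pm_hi8 modifiers yield the program-memory
+ ; word address of the function, since AVR code pointers are word addresses.)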
+ call void @bar(i8 zeroext undef, void (i16)* @callback)
+ ret void
+}
+
+declare void @bar(i8 zeroext, void (i16)*)
+
diff --git a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
index bcea4e6dfe271..4d58c85f4f232 100644
--- a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
@@ -17,8 +17,8 @@ body: |
; CHECK-LABEL: test_andiwrdrr
- ; CHECK: %r20 = ANDIRdK %r20, 175, implicit-def dead %sreg
- ; CHECK-NEXT: %r21 = ANDIRdK %r21, 250, implicit-def %sreg
+ ; CHECK: %r16 = ANDIRdK %r16, 175, implicit-def dead %sreg
+ ; CHECK-NEXT: %r17 = ANDIRdK %r17, 250, implicit-def %sreg
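+ ; (Editorial note: the 16-bit pseudo is split into two 8-bit ANDIs on the
+ ; halves of the register pair; the immediate 64175 = 0xFAAF splits into a
+ ; low byte of 175 and a high byte of 250.)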
- %r21r20 = ANDIWRdK %r17r16, 64175, implicit-def %sreg
+ %r17r16 = ANDIWRdK %r17r16, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/COMWRd.mir b/test/CodeGen/AVR/pseudo/COMWRd.mir
index 58ff7af7cb3c6..db68a4082b735 100644
--- a/test/CodeGen/AVR/pseudo/COMWRd.mir
+++ b/test/CodeGen/AVR/pseudo/COMWRd.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r14 = COMRd %r14, implicit-def dead %sreg
; CHECK-NEXT: %r15 = COMRd %r15, implicit-def %sreg
- %r15r14 = COMWRd %r9r8, implicit-def %sreg
+ %r15r14 = COMWRd %r15r14, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/ORIWRdK.mir b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
index d77a6ba884881..eaa12842df428 100644
--- a/test/CodeGen/AVR/pseudo/ORIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = ORIRdK %r20, 175, implicit-def dead %sreg
; CHECK-NEXT: %r21 = ORIRdK %r21, 250, implicit-def %sreg
- %r21r20 = ORIWRdK %r17r16, 64175, implicit-def %sreg
+ %r21r20 = ORIWRdK %r21r20, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
index 644e6106ee790..a92f6951798bf 100644
--- a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = SBCIRdK %r20, 175, implicit-def %sreg, implicit killed %sreg
; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
- %r21r20 = SBCIWRdK %r17r16, 64175, implicit-def %sreg, implicit %sreg
+ %r21r20 = SBCIWRdK %r21r20, 64175, implicit-def %sreg, implicit %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
index c7d88d7ab3f68..38ff880a51720 100644
--- a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = SUBIRdK %r20, 175, implicit-def %sreg
; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
- %r21r20 = SUBIWRdK %r17r16, 64175, implicit-def %sreg
+ %r21r20 = SUBIWRdK %r21r20, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll
index ca7ec1ab831ce..aca9502b5dfb7 100644
--- a/test/CodeGen/AVR/select-mbb-placement-bug.ll
+++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll
@@ -8,9 +8,9 @@ define internal fastcc void @loopy() {
;
; https://github.com/avr-rust/rust/issues/49
-; CHECK: LBB0_1:
-; CHECK: LBB0_2:
-; CHECK-NOT: LBB0_3:
+; CHECK: LBB0_{{[0-9]+}}:
+; CHECK: LBB0_{{[0-9]+}}:
+; CHECK-NOT: LBB0_{{[0-9]+}}:
start:
br label %bb7.preheader
diff --git a/test/CodeGen/BPF/undef.ll b/test/CodeGen/BPF/undef.ll
index de14bfde1ab97..8d8a5f429514f 100644
--- a/test/CodeGen/BPF/undef.ll
+++ b/test/CodeGen/BPF/undef.ll
@@ -1,4 +1,5 @@
-; RUN: not llc < %s -march=bpf | FileCheck %s
+; RUN: not llc < %s -march=bpfel | FileCheck -check-prefixes=CHECK,EL %s
+; RUN: not llc < %s -march=bpfeb | FileCheck -check-prefixes=CHECK,EB %s
%struct.bpf_map_def = type { i32, i32, i32, i32 }
%struct.__sk_buff = type opaque
@@ -13,36 +14,31 @@
; Function Attrs: nounwind uwtable
define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
-; CHECK: r2 = r10
-; CHECK: r2 += -2
-; CHECK: r1 = 0
-; CHECK: *(u16 *)(r2 + 6) = r1
-; CHECK: *(u16 *)(r2 + 4) = r1
-; CHECK: *(u16 *)(r2 + 2) = r1
-; CHECK: r2 = 6
-; CHECK: *(u8 *)(r10 - 7) = r2
-; CHECK: r2 = 5
-; CHECK: *(u8 *)(r10 - 8) = r2
-; CHECK: r2 = 7
-; CHECK: *(u8 *)(r10 - 6) = r2
-; CHECK: r2 = 8
-; CHECK: *(u8 *)(r10 - 5) = r2
-; CHECK: r2 = 9
-; CHECK: *(u8 *)(r10 - 4) = r2
-; CHECK: r2 = 10
-; CHECK: *(u8 *)(r10 - 3) = r2
-; CHECK: *(u16 *)(r10 + 24) = r1
-; CHECK: *(u16 *)(r10 + 22) = r1
-; CHECK: *(u16 *)(r10 + 20) = r1
-; CHECK: *(u16 *)(r10 + 18) = r1
-; CHECK: *(u16 *)(r10 + 16) = r1
-; CHECK: *(u16 *)(r10 + 14) = r1
-; CHECK: *(u16 *)(r10 + 12) = r1
-; CHECK: *(u16 *)(r10 + 10) = r1
-; CHECK: *(u16 *)(r10 + 8) = r1
-; CHECK: *(u16 *)(r10 + 6) = r1
-; CHECK: *(u16 *)(r10 - 2) = r1
-; CHECK: *(u16 *)(r10 + 26) = r1
+; CHECK: r1 = r10
+; CHECK: r1 += -2
+; CHECK: r2 = 0
+; CHECK: *(u16 *)(r1 + 6) = r2
+; CHECK: *(u16 *)(r1 + 4) = r2
+; CHECK: *(u16 *)(r1 + 2) = r2
+; EL: r1 = 134678021
+; EB: r1 = 84281096
+; CHECK: *(u32 *)(r10 - 8) = r1
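+; (Editorial note: 134678021 = 0x08070605 and 84281096 = 0x05060708, i.e. the
+; four byte values 5, 6, 7 and 8 combined into a single u32 store in little-
+; and big-endian order respectively.)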
+; CHECK: r1 = 9
+; CHECK: *(u8 *)(r10 - 4) = r1
+; CHECK: r1 = 10
+; CHECK: *(u8 *)(r10 - 3) = r1
+; CHECK: *(u16 *)(r10 + 24) = r2
+; CHECK: *(u16 *)(r10 + 22) = r2
+; CHECK: *(u16 *)(r10 + 20) = r2
+; CHECK: *(u16 *)(r10 + 18) = r2
+; CHECK: *(u16 *)(r10 + 16) = r2
+; CHECK: *(u16 *)(r10 + 14) = r2
+; CHECK: *(u16 *)(r10 + 12) = r2
+; CHECK: *(u16 *)(r10 + 10) = r2
+; CHECK: *(u16 *)(r10 + 8) = r2
+; CHECK: *(u16 *)(r10 + 6) = r2
+; CHECK: *(u16 *)(r10 - 2) = r2
+; CHECK: *(u16 *)(r10 + 26) = r2
; CHECK: r2 = r10
; CHECK: r2 += -8
; CHECK: r1 = <MCOperand Expr:(routing)>ll
diff --git a/test/CodeGen/Generic/pr33094.ll b/test/CodeGen/Generic/pr33094.ll
new file mode 100644
index 0000000000000..afa464f63f663
--- /dev/null
+++ b/test/CodeGen/Generic/pr33094.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s
+
+; PR33094
+; Make sure that a constant extractvalue doesn't cause a crash in
+; SelectionDAGBuilder::visitExtractValue.
+
+%A = type {}
+%B = type {}
+%Tuple = type { i64 }
+
+@A_Inst = global %A zeroinitializer
+@B_Inst = global %B zeroinitializer
+
+define i64 @foo() {
+ ret i64 extractvalue (%Tuple select (i1 icmp eq
+ (%B* bitcast (%A* @A_Inst to %B*), %B* @B_Inst),
+ %Tuple { i64 33 }, %Tuple { i64 42 }), 0)
+}
diff --git a/test/CodeGen/Hexagon/convertdptoint.ll b/test/CodeGen/Hexagon/convertdptoint.ll
index a09c2fd14b120..adf76e5dc82eb 100644
--- a/test/CodeGen/Hexagon/convertdptoint.ll
+++ b/test/CodeGen/Hexagon/convertdptoint.ll
@@ -12,10 +12,10 @@ entry:
%b = alloca double, align 8
%c = alloca double, align 8
store i32 0, i32* %retval
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
%2 = load double, double* %c, align 8
diff --git a/test/CodeGen/Hexagon/convertdptoll.ll b/test/CodeGen/Hexagon/convertdptoll.ll
index f46d46cf76b18..6b5bf56a248bc 100644
--- a/test/CodeGen/Hexagon/convertdptoll.ll
+++ b/test/CodeGen/Hexagon/convertdptoll.ll
@@ -17,8 +17,8 @@ entry:
%0 = load double, double* %a, align 8
%1 = load double, double* %b, align 8
%add = fadd double %0, %1
- store double %add, double* %c, align 8
- %2 = load double, double* %c, align 8
+ store volatile double %add, double* %c, align 8
+ %2 = load volatile double, double* %c, align 8
%conv = fptosi double %2 to i64
store i64 %conv, i64* %i, align 8
%3 = load i64, i64* %i, align 8
diff --git a/test/CodeGen/Hexagon/convertsptoint.ll b/test/CodeGen/Hexagon/convertsptoint.ll
index 7593e57d852f4..939b3b06a8c79 100644
--- a/test/CodeGen/Hexagon/convertsptoint.ll
+++ b/test/CodeGen/Hexagon/convertsptoint.ll
@@ -17,8 +17,8 @@ entry:
%0 = load float, float* %a, align 4
%1 = load float, float* %b, align 4
%add = fadd float %0, %1
- store float %add, float* %c, align 4
- %2 = load float, float* %c, align 4
+ store volatile float %add, float* %c, align 4
+ %2 = load volatile float, float* %c, align 4
%conv = fptosi float %2 to i32
store i32 %conv, i32* %i, align 4
%3 = load i32, i32* %i, align 4
diff --git a/test/CodeGen/Hexagon/convertsptoll.ll b/test/CodeGen/Hexagon/convertsptoll.ll
index d8432cbc812bc..f540397ccf5e5 100644
--- a/test/CodeGen/Hexagon/convertsptoll.ll
+++ b/test/CodeGen/Hexagon/convertsptoll.ll
@@ -17,8 +17,8 @@ entry:
%0 = load float, float* %a, align 4
%1 = load float, float* %b, align 4
%add = fadd float %0, %1
- store float %add, float* %c, align 4
- %2 = load float, float* %c, align 4
+ store volatile float %add, float* %c, align 4
+ %2 = load volatile float, float* %c, align 4
%conv = fptosi float %2 to i64
store i64 %conv, i64* %i, align 8
%3 = load i64, i64* %i, align 8
diff --git a/test/CodeGen/Hexagon/dadd.ll b/test/CodeGen/Hexagon/dadd.ll
index 5fcd705bab232..3068f499d12df 100644
--- a/test/CodeGen/Hexagon/dadd.ll
+++ b/test/CodeGen/Hexagon/dadd.ll
@@ -9,10 +9,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/dmul.ll b/test/CodeGen/Hexagon/dmul.ll
index 1b79e0aa7d701..a6cf62b0c0aae 100644
--- a/test/CodeGen/Hexagon/dmul.ll
+++ b/test/CodeGen/Hexagon/dmul.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %b, align 8
- %1 = load double, double* %a, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %b, align 8
+ %1 = load volatile double, double* %a, align 8
%mul = fmul double %0, %1
store double %mul, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
index 6bf8224904ec0..ccc287c5f2bcb 100644
--- a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
+++ b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
@@ -12,10 +12,10 @@ entry:
%b = alloca double, align 8
%c = alloca double, align 8
store i32 0, i32* %retval
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
%2 = load double, double* %c, align 8
diff --git a/test/CodeGen/Hexagon/dsub.ll b/test/CodeGen/Hexagon/dsub.ll
index 8b37301d84fbd..d7e44b307cf8d 100644
--- a/test/CodeGen/Hexagon/dsub.ll
+++ b/test/CodeGen/Hexagon/dsub.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %b, align 8
- %1 = load double, double* %a, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %b, align 8
+ %1 = load volatile double, double* %a, align 8
%sub = fsub double %0, %1
store double %sub, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/fadd.ll b/test/CodeGen/Hexagon/fadd.ll
index 0418c1724f5bd..65c6182dcc77f 100644
--- a/test/CodeGen/Hexagon/fadd.ll
+++ b/test/CodeGen/Hexagon/fadd.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %a, align 4
- %1 = load float, float* %b, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %a, align 4
+ %1 = load volatile float, float* %b, align 4
%add = fadd float %0, %1
store float %add, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/fmul.ll b/test/CodeGen/Hexagon/fmul.ll
index 552f98ec7a53a..e20e293c0a137 100644
--- a/test/CodeGen/Hexagon/fmul.ll
+++ b/test/CodeGen/Hexagon/fmul.ll
@@ -9,10 +9,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %b, align 4
- %1 = load float, float* %a, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %b, align 4
+ %1 = load volatile float, float* %a, align 4
%mul = fmul float %0, %1
store float %mul, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/fsub.ll b/test/CodeGen/Hexagon/fsub.ll
index d7b0e2f65b33c..e9a1fa3d192bc 100644
--- a/test/CodeGen/Hexagon/fsub.ll
+++ b/test/CodeGen/Hexagon/fsub.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %b, align 4
- %1 = load float, float* %a, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %b, align 4
+ %1 = load volatile float, float* %a, align 4
%sub = fsub float %0, %1
store float %sub, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/hasfp-crash1.ll b/test/CodeGen/Hexagon/hasfp-crash1.ll
new file mode 100644
index 0000000000000..1154a7117a70a
--- /dev/null
+++ b/test/CodeGen/Hexagon/hasfp-crash1.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that this testcase does not crash.
+; CHECK: jumpr r31
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+declare i32 @foo0(i32*, i32, i64, i32, i8 zeroext, i8 zeroext, i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @foo1(i32* %a0, i32 %a1, i32 %a2, i32 %a3, i8 zeroext %a4, i8 zeroext %a5, i32 %a6) local_unnamed_addr #0 !dbg !33 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 %a6, i64 0, metadata !51, metadata !52), !dbg !53
+ ret i32 undef, !dbg !54
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !27}
+!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22)
+!1 = !DIFile(filename: "foo.i", directory: "/path")
+!2 = !{!3, !16}
+!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5)
+!4 = !DIFile(filename: "foo.h", directory: "/path")
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !DIEnumerator(name: "E0", value: 7)
+!7 = !DIEnumerator(name: "E1", value: 6)
+!8 = !DIEnumerator(name: "E2", value: 5)
+!9 = !DIEnumerator(name: "E3", value: 0)
+!10 = !DIEnumerator(name: "E4", value: 1)
+!11 = !DIEnumerator(name: "E5", value: 7)
+!12 = !DIEnumerator(name: "E6", value: 5)
+!13 = !DIEnumerator(name: "E7", value: 4)
+!14 = !DIEnumerator(name: "E8", value: 4)
+!15 = !DIEnumerator(name: "E9", value: 10)
+!16 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17)
+!17 = !{!18, !19, !20, !21}
+!18 = !DIEnumerator(name: "F0", value: 1)
+!19 = !DIEnumerator(name: "F1", value: 2)
+!20 = !DIEnumerator(name: "F2", value: 4)
+!21 = !DIEnumerator(name: "F3", value: 7)
+!22 = !{!23, !24, !25}
+!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24)
+!26 = !{i32 2, !"Debug Info Version", i32 3}
+!27 = !{i32 6, !"Linker Options", !28}
+!28 = !{!29, !30, !31, !32}
+!29 = !{!"foo0", !".text"}
+!30 = !{!"foo1", !".text"}
+!31 = !{!"foo2", !".text"}
+!32 = !{!"foo3", !".text"}
+!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44)
+!34 = !DIFile(filename: "foo.c", directory: "/path")
+!35 = !DISubroutineType(types: !36)
+!36 = !{!37, !38, !39, !40, !41, !42, !43, !37}
+!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32)
+!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23)
+!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23)
+!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23)
+!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3)
+!43 = !DIDerivedType(tag: DW_TAG_typedef, name: "t5_t", file: !4, line: 141, baseType: !16)
+!44 = !{!45, !46, !47, !48, !49, !50, !51}
+!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38)
+!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39)
+!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40)
+!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41)
+!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42)
+!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43)
+!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37)
+!52 = !DIExpression()
+!53 = !DILocation(line: 84, column: 169, scope: !33)
+!54 = !DILocation(line: 86, column: 5, scope: !33)
diff --git a/test/CodeGen/Hexagon/hasfp-crash2.ll b/test/CodeGen/Hexagon/hasfp-crash2.ll
new file mode 100644
index 0000000000000..c8b49948ce74e
--- /dev/null
+++ b/test/CodeGen/Hexagon/hasfp-crash2.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that this testcase does not crash.
+; CHECK: call foo0
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+declare void @foo0() local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define void @foo1() local_unnamed_addr #0 !dbg !33 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !51, metadata !52), !dbg !53
+ tail call void @foo0(), !dbg !54
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !27}
+!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22)
+!1 = !DIFile(filename: "foo.i", directory: "/path")
+!2 = !{!3, !16}
+!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5)
+!4 = !DIFile(filename: "foo.h", directory: "/path")
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !DIEnumerator(name: "E0", value: 7)
+!7 = !DIEnumerator(name: "E1", value: 6)
+!8 = !DIEnumerator(name: "E2", value: 5)
+!9 = !DIEnumerator(name: "E3", value: 0)
+!10 = !DIEnumerator(name: "E4", value: 1)
+!11 = !DIEnumerator(name: "E5", value: 7)
+!12 = !DIEnumerator(name: "E6", value: 5)
+!13 = !DIEnumerator(name: "E7", value: 4)
+!14 = !DIEnumerator(name: "E8", value: 4)
+!15 = !DIEnumerator(name: "E9", value: 10)
+!16 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17)
+!17 = !{!18, !19, !20, !21}
+!18 = !DIEnumerator(name: "F0", value: 1)
+!19 = !DIEnumerator(name: "F1", value: 2)
+!20 = !DIEnumerator(name: "F2", value: 4)
+!21 = !DIEnumerator(name: "F3", value: 7)
+!22 = !{!23, !24, !25}
+!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24)
+!26 = !{i32 2, !"Debug Info Version", i32 3}
+!27 = !{i32 6, !"Linker Options", !28}
+!28 = !{!29, !30, !31, !32}
+!29 = !{!"foo0", !".text"}
+!30 = !{!"foo1", !".text"}
+!31 = !{!"foo2", !".text"}
+!32 = !{!"foo3", !".text"}
+!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44)
+!34 = !DIFile(filename: "foo.c", directory: "/path")
+!35 = !DISubroutineType(types: !36)
+!36 = !{!37, !38, !39, !40, !41, !42, !43, !37}
+!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32)
+!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23)
+!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23)
+!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23)
+!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3)
+!43 = !DIDerivedType(tag: DW_TAG_typedef, name: "t5_t", file: !4, line: 141, baseType: !16)
+!44 = !{!45, !46, !47, !48, !49, !50, !51}
+!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38)
+!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39)
+!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40)
+!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41)
+!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42)
+!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43)
+!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37)
+!52 = !DIExpression()
+!53 = !DILocation(line: 84, column: 169, scope: !33)
+!54 = !DILocation(line: 86, column: 12, scope: !33)
diff --git a/test/CodeGen/Hexagon/hvx-nontemporal.ll b/test/CodeGen/Hexagon/hvx-nontemporal.ll
new file mode 100644
index 0000000000000..98c5ef4809b08
--- /dev/null
+++ b/test/CodeGen/Hexagon/hvx-nontemporal.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+target triple = "hexagon"
+
+; Function Attrs: norecurse nounwind
+define void @test(<32 x i32>* nocapture readonly %x, <32 x i32>* nocapture readnone %y, <32 x i32>* nocapture %a, <32 x i32>* nocapture %b) #0 {
+entry:
+; CHECK: v0 = vmem(r0+#7):nt
+ %add.ptr = getelementptr inbounds <32 x i32>, <32 x i32>* %x, i32 7
+ %0 = load <32 x i32>, <32 x i32>* %add.ptr, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: v1.cur = vmem(r2+#0):nt
+ %1 = load <32 x i32>, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: vmem(r3+#3):nt = v1
+ %add.ptr2 = getelementptr inbounds <32 x i32>, <32 x i32>* %b, i32 3
+ store <32 x i32> %1, <32 x i32>* %add.ptr2, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: vmem(r2+#0):nt = v0
+ store <32 x i32> %0, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{i32 1}
diff --git a/test/CodeGen/Hexagon/target-flag-ext.mir b/test/CodeGen/Hexagon/target-flag-ext.mir
new file mode 100644
index 0000000000000..49e0d2870e00f
--- /dev/null
+++ b/test/CodeGen/Hexagon/target-flag-ext.mir
@@ -0,0 +1,24 @@
+# RUN: llc -march=hexagon -run-pass hexagon-packetizer -o - %s | FileCheck %s
+---
+name: fred
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; Check that all these instructions go in the same packet. This is to
+ ; make sure that a target flag (other than HMOTF_ConstExtend) on an
+ ; operand will not be interpreted as a constant-extender flag.
+ ; The combination used below (pcrel + 0) does not technically make sense,
+ ; but combinations that do make sense require constant extending, so
+ ; testing this is not possible otherwise.
+
+ ; CHECK: BUNDLE
+ ; CHECK-DAG: %r0 = A2_tfrsi
+ ; CHECK-DAG: %r1 = A2_tfrsi
+ ; CHECK-DAG: %r2 = A2_tfrsi
+ ; CHECK: }
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ %r1 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ %r2 = A2_tfrsi target-flags (hexagon-pcrel) 0
+...
+
diff --git a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
index 1fe42a7314881..1c81f580bee53 100644
--- a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
+++ b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
@@ -14,7 +14,7 @@
# CHECK: %3(s16) = G_LOAD %0(p0) :: (load acquire 2)
# CHECK: G_STORE %3(s16), %0(p0) :: (store release 2)
# CHECK: G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
-# CHECK: G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+# CHECK: G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8)
name: atomic_memoperands
body: |
bb.0:
@@ -25,6 +25,6 @@ body: |
%3:_(s16) = G_LOAD %0(p0) :: (load acquire 2)
G_STORE %3(s16), %0(p0) :: (store release 2)
G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
- G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+ G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8)
RET_ReallyLR
...
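This reflects the LLVM 5.0 syntax change for scoped atomics: the dedicated singlethread keyword was generalized to syncscope("<scope>"), where "singlethread" is one predefined scope and targets may define their own (the AMDGPU syncscopes test below uses "agent", "workgroup" and "wavefront"). A small IR sketch of the new spelling:

  define void @fences() {
    ; Scoped fence: only orders with code in the same thread (e.g. signal handlers).
    fence syncscope("singlethread") seq_cst
    ; Unscoped fence: the default, system-wide scope.
    fence seq_cst
    ret void
  }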
diff --git a/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir
new file mode 100644
index 0000000000000..731d7165b9df9
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir
@@ -0,0 +1,19 @@
+# RUN: not llc -mtriple=aarch64-none-linux-gnu -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define void @target_memoperands_error() {
+ ret void
+ }
+
+...
+---
+name: target_memoperands_error
+body: |
+ bb.0:
+
+ %0:_(p0) = COPY %x0
+ ; CHECK: [[@LINE+1]]:35: use of undefined target MMO flag 'aarch64-invalid'
+ %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-invalid" load 8)
+ RET_ReallyLR
+...
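The [[@LINE+1]]:35 pattern is FileCheck's line-number expansion: it resolves to the line directly below the directive, so the expected <line>:<column> prefix of the llc diagnostic keeps matching even if the test is later reflowed.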
diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir
new file mode 100644
index 0000000000000..f853b551e0986
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir
@@ -0,0 +1,22 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
+
+--- |
+
+ define void @target_memoperands() {
+ ret void
+ }
+
+...
+---
+# CHECK-LABEL: name: target_memoperands
+# CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+# CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+name: target_memoperands
+body: |
+ bb.0:
+
+ %0:_(p0) = COPY %x0
+ %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+ G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+ RET_ReallyLR
+...
diff --git a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
index 7cef01c9d12d9..c0251232fd5c7 100644
--- a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
+++ b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
@@ -171,8 +171,8 @@ body: |
# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
# CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
-# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 killed %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec
name: add_f32_1.0_multi_f16_use
@@ -307,8 +307,8 @@ body: |
# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
# CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec
-# CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %11, %14, implicit %exec
+# CHECK: %16 = V_ADD_F16_e32 %12, %14, implicit %exec
# CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
name: add_f32_1.0_one_f32_use_multi_f16_use
@@ -514,8 +514,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
# CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
-# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F32_e32 %12, %13, implicit %exec
name: add_f16_1.0_multi_f32_use
alignment: 0
@@ -581,8 +581,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
# CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
-# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
name: add_f16_1.0_other_high_bits_multi_f16_use
alignment: 0
@@ -648,8 +648,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
# CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
-# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
name: add_f16_1.0_other_high_bits_use_f16_f32
alignment: 0
exposesReturnsTwice: false
diff --git a/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/test/CodeGen/MIR/AMDGPU/syncscopes.mir
new file mode 100644
index 0000000000000..83506257d8bf8
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/syncscopes.mir
@@ -0,0 +1,98 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -run-pass=none %s -o - | FileCheck --check-prefix=GCN %s
+
+--- |
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+ target triple = "amdgcn-amd-amdhsa"
+
+ define void @syncscopes(i32 %agent, i32 addrspace(4)* %agent_out, i32 %workgroup, i32 addrspace(4)* %workgroup_out, i32 %wavefront, i32 addrspace(4)* %wavefront_out) #0 {
+ entry:
+ store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4
+ store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4
+ store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4
+ ret void
+ }
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.break(i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+ ; Function Attrs: convergent nounwind
+ declare i1 @llvm.amdgcn.loop(i64) #1
+
+ ; Function Attrs: convergent nounwind
+ declare void @llvm.amdgcn.end.cf(i64) #1
+
+ attributes #0 = { "target-cpu"="gfx803" }
+ attributes #1 = { convergent nounwind }
+ attributes #2 = { convergent nounwind readnone }
+
+# GCN-LABEL: name: syncscopes
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+...
+---
+name: syncscopes
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr4_sgpr5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr4_sgpr5
+
+ S_WAITCNT 0
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr6 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 24, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 16, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr8 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 32, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 127
+ %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr0_sgpr1
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 40, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr1, implicit %exec, implicit killed %sgpr0_sgpr1, implicit %sgpr0_sgpr1, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr6, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+ S_WAITCNT 112
+ %vgpr0 = V_MOV_B32_e32 %sgpr2, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr2_sgpr3
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr3, implicit %exec, implicit killed %sgpr2_sgpr3, implicit %sgpr2_sgpr3, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr7, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+ S_WAITCNT 112
+ %vgpr0 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr4_sgpr5
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit killed %sgpr4_sgpr5, implicit %sgpr4_sgpr5, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr8, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/MIR/AMDGPU/target-flags.mir b/test/CodeGen/MIR/AMDGPU/target-flags.mir
new file mode 100644
index 0000000000000..7d288dd1b0450
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/target-flags.mir
@@ -0,0 +1,29 @@
+# RUN: llc -march=amdgcn -run-pass none -o - %s | FileCheck %s
+--- |
+ define amdgpu_kernel void @flags() {
+ ret void
+ }
+
+ declare void @foo()
+...
+---
+
+# CHECK: SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc
+# CHECK: %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo
+
+name: flags
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ maxAlignment: 8
+registers:
+ - { id: 0, class: sreg_64, preferred-register: '' }
+ - { id: 1, class: sreg_64, preferred-register: '' }
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+ %0 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc
+ %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo
+
+ S_ENDPGM
+...
diff --git a/test/CodeGen/MIR/Generic/runPass.mir b/test/CodeGen/MIR/Generic/runPass.mir
index 33380d4c6bb4a..54c1dd221bdb7 100644
--- a/test/CodeGen/MIR/Generic/runPass.mir
+++ b/test/CodeGen/MIR/Generic/runPass.mir
@@ -1,5 +1,6 @@
# RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s | FileCheck %s
# RUN: llc -run-pass=regallocbasic -debug-pass=Arguments -o - %s | FileCheck %s
+# RUN: llc -run-pass=regallocfast -debug-pass=Arguments -o - %s | FileCheck %s
# Check that passes are initialized correctly, so that it's possible to
# use -run-pass.
@@ -7,6 +8,7 @@
---
# CHECK: name: foo
name: foo
+tracksRegLiveness: true
body: |
bb.0:
...
diff --git a/test/CodeGen/MIR/Hexagon/target-flags.mir b/test/CodeGen/MIR/Hexagon/target-flags.mir
new file mode 100644
index 0000000000000..656e0a6ea8596
--- /dev/null
+++ b/test/CodeGen/MIR/Hexagon/target-flags.mir
@@ -0,0 +1,36 @@
+# RUN: llc -march=hexagon -run-pass none -o - %s | FileCheck %s
+---
+name: fred
+
+body: |
+ bb.0:
+
+ ; CHECK: target-flags(hexagon-pcrel)
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ ; CHECK: target-flags(hexagon-got)
+ %r0 = A2_tfrsi target-flags (hexagon-got) 0
+ ; CHECK: target-flags(hexagon-lo16)
+ %r0 = A2_tfrsi target-flags (hexagon-lo16) 0
+ ; CHECK: target-flags(hexagon-hi16)
+ %r0 = A2_tfrsi target-flags (hexagon-hi16) 0
+ ; CHECK: target-flags(hexagon-gprel)
+ %r0 = A2_tfrsi target-flags (hexagon-gprel) 0
+ ; CHECK: target-flags(hexagon-gdgot)
+ %r0 = A2_tfrsi target-flags (hexagon-gdgot) 0
+ ; CHECK: target-flags(hexagon-gdplt)
+ %r0 = A2_tfrsi target-flags (hexagon-gdplt) 0
+ ; CHECK: target-flags(hexagon-ie)
+ %r0 = A2_tfrsi target-flags (hexagon-ie) 0
+ ; CHECK: target-flags(hexagon-iegot)
+ %r0 = A2_tfrsi target-flags (hexagon-iegot) 0
+ ; CHECK: target-flags(hexagon-tprel)
+ %r0 = A2_tfrsi target-flags (hexagon-tprel) 0
+
+ ; CHECK: target-flags(hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-ext) 0
+ ; CHECK: target-flags(hexagon-pcrel, hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel,hexagon-ext) 0
+ ; CHECK: target-flags(hexagon-ie, hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-ie,hexagon-ext) 0
+...
+
diff --git a/test/CodeGen/MIR/X86/tied-physical-regs-match.mir b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
new file mode 100644
index 0000000000000..1ddf649f76a7c
--- /dev/null
+++ b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
@@ -0,0 +1,22 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the Machine Verifier detects tied physical registers
+# that don't match.
+
+--- |
+
+ define i32 @foo() {
+ entry:
+ ret i32 0
+ }
+
+...
+---
+name: foo
+body: |
+ bb.0.entry:
+ liveins: %rdi
+
+ ; CHECK: Tied physical registers must match.
+ %rbx = AND64rm killed %rdx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags
+ RETQ %rbx
+...
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index 951002d60a037..14a799b91717d 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -64,6 +64,6 @@ entry:
%0 = load i16, i16* %retval ; <i16> [#uses=1]
ret i16 %0
; CHECK-LABEL: mov2:
-; CHECK: mov.w 0(r1), 4(r1)
-; CHECK: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 0(r1), 4(r1)
}
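Switching these two lines from CHECK to CHECK-DAG relaxes the match from a fixed order to "both appear, in either order", which is the right constraint here: the two mov.w copies are independent, so the scheduler may emit them in either order. Schematically:

  ; CHECK-DAG: <first of a group of independent instructions>
  ; CHECK-DAG: <second of the group, in any order relative to the first>
  ; CHECK:     <instruction that must come after the whole group>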
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
index f522c6722ee6f..4298442157e23 100644
--- a/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
+; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
; llvm.mem* intrinsics get lowered to loops.
@@ -32,6 +33,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -50,6 +68,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_volatile_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -65,6 +100,32 @@ entry:
; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
+
+; WIR-LABEL: @memcpy_casting_caller
+; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
+; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
+; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
+; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
+}
+
+define i8* @memcpy_known_size(i8* %dst, i8* %src) {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i32 1, i1 false)
+ ret i8* %dst
+
+; Check that calls with compile-time constant size are handled correctly
+; WIR-LABEL: @memcpy_known_size
+; WIR: entry:
+; WIR: br label %load-store-loop
+; WIR: load-store-loop:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
+; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
}
define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
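For reference, a standalone sketch of the loop structure the WIR lines check for (value names chosen for readability; the pass itself emits numbered values): -use-wide-memcpy-loop-lowering expands a memcpy into a main loop over the widest legal element type plus a residual loop for the tail, and with the 1-byte element used here the udiv/urem by 1 are degenerate but the control flow is the general one.

  define void @memcpy_byte_loop(i8* %dst, i8* %src, i64 %n) {
  entry:
    ; Guard the loop: a zero-length copy skips straight to the exit.
    %cond = icmp ne i64 %n, 0
    br i1 %cond, label %loop, label %done

  loop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    %s = getelementptr inbounds i8, i8* %src, i64 %i
    %v = load i8, i8* %s
    %d = getelementptr inbounds i8, i8* %dst, i64 %i
    store i8 %v, i8* %d
    %i.next = add i64 %i, 1
    %again = icmp ult i64 %i.next, %n
    br i1 %again, label %loop, label %done

  done:
    ret void
  }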
diff --git a/test/CodeGen/PowerPC/PR33636.ll b/test/CodeGen/PowerPC/PR33636.ll
new file mode 100644
index 0000000000000..4a1216dd4c113
--- /dev/null
+++ b/test/CodeGen/PowerPC/PR33636.ll
@@ -0,0 +1,702 @@
+; Just a test case for a crash reported in
+; https://bugs.llvm.org/show_bug.cgi?id=33636
+; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 < %s | FileCheck %s
+@g_225 = external unnamed_addr global i16, align 2
+@g_756 = external global [6 x i32], align 4
+@g_3456 = external global i32, align 4
+@g_3708 = external global [9 x i32], align 4
+@g_1252 = external global i8*, align 8
+@g_3043 = external global float*, align 8
+
+; Function Attrs: nounwind
+define void @main() {
+ br i1 undef, label %1, label %4
+
+; <label>:1: ; preds = %0
+ br i1 undef, label %2, label %3
+
+; <label>:2: ; preds = %1
+ br label %3
+
+; <label>:3: ; preds = %2, %1
+ br label %4
+
+; <label>:4: ; preds = %3, %0
+ br label %5
+
+; <label>:5: ; preds = %5, %4
+ br i1 undef, label %6, label %5
+
+; <label>:6: ; preds = %5
+ br i1 undef, label %7, label %8
+
+; <label>:7: ; preds = %6
+ br i1 undef, label %70, label %69
+
+; <label>:8: ; preds = %6
+ br i1 undef, label %9, label %50
+
+; <label>:9: ; preds = %8
+ br label %11
+
+; <label>:10: ; preds = %28
+ br i1 undef, label %11, label %12
+
+; <label>:11: ; preds = %10, %9
+ br label %13
+
+; <label>:12: ; preds = %10
+ br label %30
+
+; <label>:13: ; preds = %23, %11
+ br i1 undef, label %17, label %14
+
+; <label>:14: ; preds = %13
+ br i1 undef, label %16, label %15
+
+; <label>:15: ; preds = %14
+ br label %22
+
+; <label>:16: ; preds = %14
+ br label %17
+
+; <label>:17: ; preds = %16, %13
+ br i1 undef, label %18, label %19
+
+; <label>:18: ; preds = %17
+ br label %19
+
+; <label>:19: ; preds = %18, %17
+ br i1 undef, label %48, label %20
+
+; <label>:20: ; preds = %19
+ br i1 undef, label %48, label %21
+
+; <label>:21: ; preds = %20
+ br label %22
+
+; <label>:22: ; preds = %21, %15
+ br i1 undef, label %23, label %24
+
+; <label>:23: ; preds = %22
+ br label %13
+
+; <label>:24: ; preds = %22
+ br i1 undef, label %28, label %25
+
+; <label>:25: ; preds = %24
+ br label %26
+
+; <label>:26: ; preds = %26, %25
+ br i1 undef, label %26, label %27
+
+; <label>:27: ; preds = %26
+ br label %48
+
+; <label>:28: ; preds = %24
+ br i1 undef, label %29, label %10
+
+; <label>:29: ; preds = %28
+ br label %48
+
+; <label>:30: ; preds = %33, %12
+ br i1 undef, label %32, label %33
+
+; <label>:31: ; preds = %33
+ br label %34
+
+; <label>:32: ; preds = %30
+ br label %33
+
+; <label>:33: ; preds = %32, %30
+ br i1 undef, label %30, label %31
+
+; <label>:34: ; preds = %47, %31
+ br i1 undef, label %35, label %36
+
+; <label>:35: ; preds = %34
+ br label %36
+
+; <label>:36: ; preds = %35, %34
+ br label %37
+
+; <label>:37: ; preds = %45, %36
+ br i1 undef, label %40, label %38
+
+; <label>:38: ; preds = %37
+ br i1 undef, label %39, label %46
+
+; <label>:39: ; preds = %38
+ br label %41
+
+; <label>:40: ; preds = %37
+ br label %41
+
+; <label>:41: ; preds = %40, %39
+ br label %42
+
+; <label>:42: ; preds = %44, %41
+ br i1 undef, label %43, label %44
+
+; <label>:43: ; preds = %42
+ br label %44
+
+; <label>:44: ; preds = %43, %42
+ br i1 undef, label %42, label %45
+
+; <label>:45: ; preds = %44
+ br i1 undef, label %37, label %47
+
+; <label>:46: ; preds = %38
+ br label %48
+
+; <label>:47: ; preds = %45
+ br i1 undef, label %34, label %49
+
+; <label>:48: ; preds = %46, %29, %27, %20, %19
+ br label %65
+
+; <label>:49: ; preds = %47
+ br label %58
+
+; <label>:50: ; preds = %8
+ br i1 undef, label %52, label %51
+
+; <label>:51: ; preds = %50
+ br label %57
+
+; <label>:52: ; preds = %50
+ br label %53
+
+; <label>:53: ; preds = %56, %52
+ br i1 undef, label %54, label %59
+
+; <label>:54: ; preds = %53
+ br i1 undef, label %60, label %59
+
+; <label>:55: ; preds = %64
+ br label %56
+
+; <label>:56: ; preds = %64, %55
+ br i1 undef, label %57, label %53
+
+; <label>:57: ; preds = %56, %51
+ br label %58
+
+; <label>:58: ; preds = %57, %49
+ br label %65
+
+; <label>:59: ; preds = %63, %62, %61, %60, %54, %53
+ br label %65
+
+; <label>:60: ; preds = %54
+ br i1 undef, label %61, label %59
+
+; <label>:61: ; preds = %60
+ br i1 undef, label %62, label %59
+
+; <label>:62: ; preds = %61
+ br i1 undef, label %63, label %59
+
+; <label>:63: ; preds = %62
+ br i1 undef, label %64, label %59
+
+; <label>:64: ; preds = %63
+ br i1 undef, label %55, label %56
+
+; <label>:65: ; preds = %59, %58, %48
+ br i1 undef, label %66, label %67
+
+; <label>:66: ; preds = %65
+ br label %67
+
+; <label>:67: ; preds = %66, %65
+ br i1 undef, label %68, label %92
+
+; <label>:68: ; preds = %67
+ br label %92
+
+; <label>:69: ; preds = %7
+ br label %70
+
+; <label>:70: ; preds = %69, %7
+ br i1 undef, label %72, label %71
+
+; <label>:71: ; preds = %70
+ br label %72
+
+; <label>:72: ; preds = %71, %70
+ br i1 undef, label %73, label %74
+
+; <label>:73: ; preds = %72
+ br label %74
+
+; <label>:74: ; preds = %73, %72
+ br i1 undef, label %85, label %75
+
+; <label>:75: ; preds = %74
+ br i1 undef, label %84, label %76
+
+; <label>:76: ; preds = %75
+ br i1 undef, label %78, label %77
+
+; <label>:77: ; preds = %77, %76
+ br i1 undef, label %84, label %77
+
+; <label>:78: ; preds = %76
+ br label %79
+
+; <label>:79: ; preds = %83, %78
+ br i1 undef, label %83, label %80
+
+; <label>:80: ; preds = %79
+ br i1 undef, label %81, label %82
+
+; <label>:81: ; preds = %80
+ br label %83
+
+; <label>:82: ; preds = %80
+ br label %83
+
+; <label>:83: ; preds = %82, %81, %79
+ br i1 undef, label %90, label %79
+
+; <label>:84: ; preds = %77, %75
+ br label %92
+
+; <label>:85: ; preds = %74
+ br i1 undef, label %86, label %88
+
+; <label>:86: ; preds = %85
+ br i1 undef, label %89, label %87
+
+; <label>:87: ; preds = %86
+ br i1 undef, label %89, label %88
+
+; <label>:88: ; preds = %87, %85
+ br label %89
+
+; <label>:89: ; preds = %88, %87, %86
+ br label %92
+
+; <label>:90: ; preds = %83
+ br i1 undef, label %92, label %91
+
+; <label>:91: ; preds = %90
+ br label %92
+
+; <label>:92: ; preds = %91, %90, %89, %84, %68, %67
+ br label %93
+
+; <label>:93: ; preds = %100, %92
+ br label %94
+
+; <label>:94: ; preds = %98, %93
+ br label %95
+
+; <label>:95: ; preds = %97, %94
+ br i1 undef, label %96, label %97
+
+; <label>:96: ; preds = %95
+ br label %97
+
+; <label>:97: ; preds = %96, %95
+ br i1 undef, label %95, label %98
+
+; <label>:98: ; preds = %97
+ store i32 7, i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 7), align 4
+ %99 = load volatile i32, i32* @g_3456, align 4
+ br i1 undef, label %94, label %100
+
+; <label>:100: ; preds = %98
+ br i1 undef, label %93, label %101
+
+; <label>:101: ; preds = %100
+ br label %102
+
+; <label>:102: ; preds = %117, %101
+ br label %103
+
+; <label>:103: ; preds = %109, %102
+ store i8** @g_1252, i8*** undef, align 8
+ br i1 undef, label %105, label %104
+
+; <label>:104: ; preds = %103
+ br label %105
+
+; <label>:105: ; preds = %104, %103
+ %106 = icmp eq i32 0, 0
+ br i1 %106, label %107, label %116
+
+; <label>:107: ; preds = %105
+ br i1 icmp ne (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)), label %109, label %108
+
+; <label>:108: ; preds = %107
+ br label %109
+
+; <label>:109: ; preds = %108, %107
+ %110 = phi i32 [ sdiv (i32 32, i32 zext (i1 icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)) to i32)), %108 ], [ 32, %107 ]
+ %111 = trunc i32 %110 to i8
+ %112 = icmp ne i8 %111, 0
+ %113 = and i1 %112, icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4))
+ %114 = zext i1 %113 to i16
+ store i16 %114, i16* @g_225, align 2
+ %115 = load volatile float*, float** @g_3043, align 8
+ br i1 undef, label %103, label %117
+
+; <label>:116: ; preds = %105
+ br label %119
+
+; <label>:117: ; preds = %109
+ br i1 undef, label %102, label %118
+
+; <label>:118: ; preds = %117
+ br label %119
+
+; <label>:119: ; preds = %118, %116
+ br i1 undef, label %120, label %231
+
+; <label>:120: ; preds = %119
+ br label %232
+
+; <label>:121: ; preds = %230
+ br label %122
+
+; <label>:122: ; preds = %230, %121
+ br i1 undef, label %124, label %123
+
+; <label>:123: ; preds = %122
+ br label %124
+
+; <label>:124: ; preds = %123, %122
+ br i1 undef, label %228, label %225
+
+; <label>:125: ; preds = %218
+ br label %127
+
+; <label>:126: ; preds = %218
+ br label %127
+
+; <label>:127: ; preds = %216, %126, %125
+ br i1 undef, label %204, label %128
+
+; <label>:128: ; preds = %127
+ br label %205
+
+; <label>:129: ; preds = %216
+ br i1 undef, label %131, label %130
+
+; <label>:130: ; preds = %129
+ br label %131
+
+; <label>:131: ; preds = %130, %129
+ br i1 undef, label %133, label %132
+
+; <label>:132: ; preds = %131
+ br label %133
+
+; <label>:133: ; preds = %132, %131
+ br label %134
+
+; <label>:134: ; preds = %203, %133
+ br i1 undef, label %193, label %135
+
+; <label>:135: ; preds = %134
+ br label %194
+
+; <label>:136: ; preds = %203
+ br i1 undef, label %138, label %137
+
+; <label>:137: ; preds = %136
+ br label %138
+
+; <label>:138: ; preds = %137, %136
+ br i1 undef, label %192, label %139
+
+; <label>:139: ; preds = %138
+ br label %191
+
+; <label>:140: ; preds = %191, %190
+ br i1 undef, label %180, label %141
+
+; <label>:141: ; preds = %140
+ br label %181
+
+; <label>:142: ; preds = %190
+ br i1 undef, label %143, label %178
+
+; <label>:143: ; preds = %142
+ br label %179
+
+; <label>:144: ; preds = %179
+ br label %176
+
+; <label>:145: ; preds = %179
+ br label %176
+
+; <label>:146: ; preds = %177, %175, %174
+ br i1 undef, label %165, label %147
+
+; <label>:147: ; preds = %146
+ br label %166
+
+; <label>:148: ; preds = %174
+ br label %149
+
+; <label>:149: ; preds = %164, %148
+ br i1 undef, label %154, label %150
+
+; <label>:150: ; preds = %149
+ br label %155
+
+; <label>:151: ; preds = %164
+ br i1 undef, label %153, label %152
+
+; <label>:152: ; preds = %151
+ br label %153
+
+; <label>:153: ; preds = %152, %151
+ ret void
+
+; <label>:154: ; preds = %149
+ br label %155
+
+; <label>:155: ; preds = %154, %150
+ br i1 undef, label %157, label %156
+
+; <label>:156: ; preds = %155
+ br label %158
+
+; <label>:157: ; preds = %155
+ br label %158
+
+; <label>:158: ; preds = %157, %156
+ br i1 undef, label %160, label %159
+
+; <label>:159: ; preds = %158
+ br label %161
+
+; <label>:160: ; preds = %158
+ br label %161
+
+; <label>:161: ; preds = %160, %159
+ br i1 undef, label %163, label %162
+
+; <label>:162: ; preds = %161
+ br label %164
+
+; <label>:163: ; preds = %161
+ br label %164
+
+; <label>:164: ; preds = %163, %162
+ br i1 undef, label %151, label %149
+
+; <label>:165: ; preds = %146
+ br label %166
+
+; <label>:166: ; preds = %165, %147
+ br i1 undef, label %168, label %167
+
+; <label>:167: ; preds = %166
+ br label %169
+
+; <label>:168: ; preds = %166
+ br label %169
+
+; <label>:169: ; preds = %168, %167
+ br i1 undef, label %171, label %170
+
+; <label>:170: ; preds = %169
+ br label %172
+
+; <label>:171: ; preds = %169
+ br label %172
+
+; <label>:172: ; preds = %171, %170
+ br i1 undef, label %174, label %173
+
+; <label>:173: ; preds = %172
+ br label %174
+
+; <label>:174: ; preds = %173, %172
+ br i1 undef, label %148, label %146
+
+; <label>:175: ; preds = %176
+ br label %146
+
+; <label>:176: ; preds = %145, %144
+ br i1 undef, label %177, label %175
+
+; <label>:177: ; preds = %176
+ br label %146
+
+; <label>:178: ; preds = %142
+ br label %179
+
+; <label>:179: ; preds = %178, %143
+ br i1 undef, label %145, label %144
+
+; <label>:180: ; preds = %140
+ br label %181
+
+; <label>:181: ; preds = %180, %141
+ br i1 undef, label %183, label %182
+
+; <label>:182: ; preds = %181
+ br label %184
+
+; <label>:183: ; preds = %181
+ br label %184
+
+; <label>:184: ; preds = %183, %182
+ br i1 undef, label %186, label %185
+
+; <label>:185: ; preds = %184
+ br label %187
+
+; <label>:186: ; preds = %184
+ br label %187
+
+; <label>:187: ; preds = %186, %185
+ br i1 undef, label %189, label %188
+
+; <label>:188: ; preds = %187
+ br label %190
+
+; <label>:189: ; preds = %187
+ br label %190
+
+; <label>:190: ; preds = %189, %188
+ br i1 undef, label %142, label %140
+
+; <label>:191: ; preds = %192, %139
+ br label %140
+
+; <label>:192: ; preds = %138
+ br label %191
+
+; <label>:193: ; preds = %134
+ br label %194
+
+; <label>:194: ; preds = %193, %135
+ br i1 undef, label %196, label %195
+
+; <label>:195: ; preds = %194
+ br label %197
+
+; <label>:196: ; preds = %194
+ br label %197
+
+; <label>:197: ; preds = %196, %195
+ br i1 undef, label %199, label %198
+
+; <label>:198: ; preds = %197
+ br label %200
+
+; <label>:199: ; preds = %197
+ br label %200
+
+; <label>:200: ; preds = %199, %198
+ br i1 undef, label %202, label %201
+
+; <label>:201: ; preds = %200
+ br label %203
+
+; <label>:202: ; preds = %200
+ br label %203
+
+; <label>:203: ; preds = %202, %201
+ br i1 undef, label %136, label %134
+
+; <label>:204: ; preds = %127
+ br label %205
+
+; <label>:205: ; preds = %204, %128
+ br i1 undef, label %207, label %206
+
+; <label>:206: ; preds = %205
+ br label %208
+
+; <label>:207: ; preds = %205
+ br label %208
+
+; <label>:208: ; preds = %207, %206
+ br i1 undef, label %210, label %209
+
+; <label>:209: ; preds = %208
+ br label %211
+
+; <label>:210: ; preds = %208
+ br label %211
+
+; <label>:211: ; preds = %210, %209
+ br i1 undef, label %213, label %212
+
+; <label>:212: ; preds = %211
+ br label %214
+
+; <label>:213: ; preds = %211
+ br label %214
+
+; <label>:214: ; preds = %213, %212
+ br i1 undef, label %216, label %215
+
+; <label>:215: ; preds = %214
+ br label %216
+
+; <label>:216: ; preds = %215, %214
+ br i1 undef, label %129, label %127
+
+; <label>:217: ; preds = %220
+ br label %218
+
+; <label>:218: ; preds = %221, %217
+ br i1 undef, label %126, label %125
+
+; <label>:219: ; preds = %223
+ br label %220
+
+; <label>:220: ; preds = %224, %219
+ br i1 undef, label %221, label %217
+
+; <label>:221: ; preds = %220
+ br label %218
+
+; <label>:222: ; preds = %226
+ br label %223
+
+; <label>:223: ; preds = %227, %222
+ br i1 undef, label %224, label %219
+
+; <label>:224: ; preds = %223
+ br label %220
+
+; <label>:225: ; preds = %124
+ br label %226
+
+; <label>:226: ; preds = %228, %225
+ br i1 undef, label %227, label %222
+
+; <label>:227: ; preds = %226
+ br label %223
+
+; <label>:228: ; preds = %124
+ br label %226
+
+; <label>:229: ; preds = %232
+ br label %230
+
+; <label>:230: ; preds = %233, %229
+ br i1 undef, label %122, label %121
+
+; <label>:231: ; preds = %119
+ br label %232
+
+; <label>:232: ; preds = %231, %120
+ br i1 undef, label %233, label %229
+
+; <label>:233: ; preds = %232
+ br label %230
+
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index d57b3a203791c..0c7a31d16b199 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -370,7 +370,7 @@ define void @test36() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread acquire
+ fence syncscope("singlethread") acquire
ret void
}
@@ -379,7 +379,7 @@ define void @test37() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread release
+ fence syncscope("singlethread") release
ret void
}
@@ -388,7 +388,7 @@ define void @test38() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread acq_rel
+ fence syncscope("singlethread") acq_rel
ret void
}
@@ -397,7 +397,7 @@ define void @test39() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: blr
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
@@ -1273,7 +1273,7 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread monotonic monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1294,7 +1294,7 @@ define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1315,7 +1315,7 @@ define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1336,7 +1336,7 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1357,7 +1357,7 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
ret void
}
@@ -1379,7 +1379,7 @@ define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1401,7 +1401,7 @@ define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1423,7 +1423,7 @@ define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1445,7 +1445,7 @@ define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1467,7 +1467,7 @@ define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1487,7 +1487,7 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread monotonic monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1508,7 +1508,7 @@ define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1529,7 +1529,7 @@ define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1550,7 +1550,7 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1571,7 +1571,7 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
ret void
}
@@ -1593,7 +1593,7 @@ define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1615,7 +1615,7 @@ define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1637,7 +1637,7 @@ define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1659,7 +1659,7 @@ define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1681,7 +1681,7 @@ define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1701,7 +1701,7 @@ define void @test100(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread monotonic monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1722,7 +1722,7 @@ define void @test101(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1743,7 +1743,7 @@ define void @test102(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1764,7 +1764,7 @@ define void @test103(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1785,7 +1785,7 @@ define void @test104(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
ret void
}
@@ -1807,7 +1807,7 @@ define void @test105(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1829,7 +1829,7 @@ define void @test106(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1851,7 +1851,7 @@ define void @test107(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1873,7 +1873,7 @@ define void @test108(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1895,7 +1895,7 @@ define void @test109(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1915,7 +1915,7 @@ define void @test110(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread monotonic monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1936,7 +1936,7 @@ define void @test111(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1957,7 +1957,7 @@ define void @test112(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1978,7 +1978,7 @@ define void @test113(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1999,7 +1999,7 @@ define void @test114(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
ret void
}
@@ -2021,7 +2021,7 @@ define void @test115(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -2043,7 +2043,7 @@ define void @test116(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -2065,7 +2065,7 @@ define void @test117(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -2087,7 +2087,7 @@ define void @test118(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -2109,7 +2109,7 @@ define void @test119(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
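The cmpxchg hunks above are purely mechanical: the bare singlethread keyword becomes the generalized syncscope("singlethread") form, while the pointer operand, the comparison/new values, and the success/failure orderings are left untouched. As a minimal self-contained sketch of the two spellings (the function name @sketch is hypothetical, not one of the numbered tests in this file):

; before this change:
;   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire monotonic
; after this change:
define void @sketch(i8* %ptr, i8 %cmp, i8 %val) {
  ; the returned {i8, i1} pair is assigned but unused, as in the tests above
  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
  ret void
}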
@@ -5847,7 +5847,7 @@ define i8 @test340(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -5862,7 +5862,7 @@ define i8 @test341(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -5877,7 +5877,7 @@ define i8 @test342(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -5893,7 +5893,7 @@ define i8 @test343(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -5909,7 +5909,7 @@ define i8 @test344(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -5923,7 +5923,7 @@ define i16 @test345(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -5938,7 +5938,7 @@ define i16 @test346(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -5953,7 +5953,7 @@ define i16 @test347(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -5969,7 +5969,7 @@ define i16 @test348(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -5985,7 +5985,7 @@ define i16 @test349(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -5999,7 +5999,7 @@ define i32 @test350(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6014,7 +6014,7 @@ define i32 @test351(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6029,7 +6029,7 @@ define i32 @test352(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6045,7 +6045,7 @@ define i32 @test353(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6061,7 +6061,7 @@ define i32 @test354(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6075,7 +6075,7 @@ define i64 @test355(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6090,7 +6090,7 @@ define i64 @test356(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6105,7 +6105,7 @@ define i64 @test357(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6121,7 +6121,7 @@ define i64 @test358(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6137,7 +6137,7 @@ define i64 @test359(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
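The atomicrmw hunks follow the same pattern for every operation updated below (xchg, add, sub, and, nand, or, xor, max, min): only the scope spelling changes; the operation, operands, and ordering do not. A minimal sketch of the updated form (the function name @rmw_sketch is hypothetical):

define i64 @rmw_sketch(i64* %ptr, i64 %val) {
  ; syncscope("singlethread") limits synchronization to the executing thread;
  ; omitting the syncscope gives the default cross-thread "system" scope
  %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") seq_cst
  ret i64 %ret
}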
@@ -6152,7 +6152,7 @@ define i8 @test360(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6168,7 +6168,7 @@ define i8 @test361(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6184,7 +6184,7 @@ define i8 @test362(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6201,7 +6201,7 @@ define i8 @test363(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6218,7 +6218,7 @@ define i8 @test364(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6233,7 +6233,7 @@ define i16 @test365(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6249,7 +6249,7 @@ define i16 @test366(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6265,7 +6265,7 @@ define i16 @test367(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6282,7 +6282,7 @@ define i16 @test368(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6299,7 +6299,7 @@ define i16 @test369(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6314,7 +6314,7 @@ define i32 @test370(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6330,7 +6330,7 @@ define i32 @test371(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6346,7 +6346,7 @@ define i32 @test372(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6363,7 +6363,7 @@ define i32 @test373(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6380,7 +6380,7 @@ define i32 @test374(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6395,7 +6395,7 @@ define i64 @test375(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6411,7 +6411,7 @@ define i64 @test376(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6427,7 +6427,7 @@ define i64 @test377(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6444,7 +6444,7 @@ define i64 @test378(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6461,7 +6461,7 @@ define i64 @test379(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -6476,7 +6476,7 @@ define i8 @test380(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6492,7 +6492,7 @@ define i8 @test381(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6508,7 +6508,7 @@ define i8 @test382(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6525,7 +6525,7 @@ define i8 @test383(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6542,7 +6542,7 @@ define i8 @test384(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6557,7 +6557,7 @@ define i16 @test385(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6573,7 +6573,7 @@ define i16 @test386(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6589,7 +6589,7 @@ define i16 @test387(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6606,7 +6606,7 @@ define i16 @test388(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6623,7 +6623,7 @@ define i16 @test389(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6638,7 +6638,7 @@ define i32 @test390(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6654,7 +6654,7 @@ define i32 @test391(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6670,7 +6670,7 @@ define i32 @test392(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6687,7 +6687,7 @@ define i32 @test393(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6704,7 +6704,7 @@ define i32 @test394(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6719,7 +6719,7 @@ define i64 @test395(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6735,7 +6735,7 @@ define i64 @test396(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6751,7 +6751,7 @@ define i64 @test397(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6768,7 +6768,7 @@ define i64 @test398(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6785,7 +6785,7 @@ define i64 @test399(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -6800,7 +6800,7 @@ define i8 @test400(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6816,7 +6816,7 @@ define i8 @test401(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6832,7 +6832,7 @@ define i8 @test402(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6849,7 +6849,7 @@ define i8 @test403(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6866,7 +6866,7 @@ define i8 @test404(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6881,7 +6881,7 @@ define i16 @test405(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6897,7 +6897,7 @@ define i16 @test406(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6913,7 +6913,7 @@ define i16 @test407(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6930,7 +6930,7 @@ define i16 @test408(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6947,7 +6947,7 @@ define i16 @test409(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6962,7 +6962,7 @@ define i32 @test410(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6978,7 +6978,7 @@ define i32 @test411(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6994,7 +6994,7 @@ define i32 @test412(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7011,7 +7011,7 @@ define i32 @test413(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7028,7 +7028,7 @@ define i32 @test414(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7043,7 +7043,7 @@ define i64 @test415(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7059,7 +7059,7 @@ define i64 @test416(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7075,7 +7075,7 @@ define i64 @test417(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7092,7 +7092,7 @@ define i64 @test418(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7109,7 +7109,7 @@ define i64 @test419(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7124,7 +7124,7 @@ define i8 @test420(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7140,7 +7140,7 @@ define i8 @test421(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7156,7 +7156,7 @@ define i8 @test422(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7173,7 +7173,7 @@ define i8 @test423(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7190,7 +7190,7 @@ define i8 @test424(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7205,7 +7205,7 @@ define i16 @test425(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7221,7 +7221,7 @@ define i16 @test426(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7237,7 +7237,7 @@ define i16 @test427(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7254,7 +7254,7 @@ define i16 @test428(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7271,7 +7271,7 @@ define i16 @test429(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7286,7 +7286,7 @@ define i32 @test430(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7302,7 +7302,7 @@ define i32 @test431(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7318,7 +7318,7 @@ define i32 @test432(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7335,7 +7335,7 @@ define i32 @test433(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7352,7 +7352,7 @@ define i32 @test434(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7367,7 +7367,7 @@ define i64 @test435(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7383,7 +7383,7 @@ define i64 @test436(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7399,7 +7399,7 @@ define i64 @test437(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7416,7 +7416,7 @@ define i64 @test438(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7433,7 +7433,7 @@ define i64 @test439(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7448,7 +7448,7 @@ define i8 @test440(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7464,7 +7464,7 @@ define i8 @test441(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7480,7 +7480,7 @@ define i8 @test442(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7497,7 +7497,7 @@ define i8 @test443(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7514,7 +7514,7 @@ define i8 @test444(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7529,7 +7529,7 @@ define i16 @test445(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7545,7 +7545,7 @@ define i16 @test446(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7561,7 +7561,7 @@ define i16 @test447(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7578,7 +7578,7 @@ define i16 @test448(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7595,7 +7595,7 @@ define i16 @test449(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7610,7 +7610,7 @@ define i32 @test450(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7626,7 +7626,7 @@ define i32 @test451(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7642,7 +7642,7 @@ define i32 @test452(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7659,7 +7659,7 @@ define i32 @test453(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7676,7 +7676,7 @@ define i32 @test454(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7691,7 +7691,7 @@ define i64 @test455(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7707,7 +7707,7 @@ define i64 @test456(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7723,7 +7723,7 @@ define i64 @test457(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7740,7 +7740,7 @@ define i64 @test458(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7757,7 +7757,7 @@ define i64 @test459(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7772,7 +7772,7 @@ define i8 @test460(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7788,7 +7788,7 @@ define i8 @test461(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7804,7 +7804,7 @@ define i8 @test462(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7821,7 +7821,7 @@ define i8 @test463(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7838,7 +7838,7 @@ define i8 @test464(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7853,7 +7853,7 @@ define i16 @test465(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7869,7 +7869,7 @@ define i16 @test466(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7885,7 +7885,7 @@ define i16 @test467(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7902,7 +7902,7 @@ define i16 @test468(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7919,7 +7919,7 @@ define i16 @test469(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7934,7 +7934,7 @@ define i32 @test470(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7950,7 +7950,7 @@ define i32 @test471(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7966,7 +7966,7 @@ define i32 @test472(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7983,7 +7983,7 @@ define i32 @test473(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8000,7 +8000,7 @@ define i32 @test474(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8015,7 +8015,7 @@ define i64 @test475(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8031,7 +8031,7 @@ define i64 @test476(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8047,7 +8047,7 @@ define i64 @test477(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8064,7 +8064,7 @@ define i64 @test478(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8081,7 +8081,7 @@ define i64 @test479(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8099,7 +8099,7 @@ define i8 @test480(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB480_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8118,7 +8118,7 @@ define i8 @test481(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB481_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8137,7 +8137,7 @@ define i8 @test482(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB482_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8157,7 +8157,7 @@ define i8 @test483(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8177,7 +8177,7 @@ define i8 @test484(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8195,7 +8195,7 @@ define i16 @test485(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB485_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8214,7 +8214,7 @@ define i16 @test486(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB486_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8233,7 +8233,7 @@ define i16 @test487(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB487_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8253,7 +8253,7 @@ define i16 @test488(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -8273,7 +8273,7 @@ define i16 @test489(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -8290,7 +8290,7 @@ define i32 @test490(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB490_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -8308,7 +8308,7 @@ define i32 @test491(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB491_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -8326,7 +8326,7 @@ define i32 @test492(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB492_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -8345,7 +8345,7 @@ define i32 @test493(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8364,7 +8364,7 @@ define i32 @test494(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8381,7 +8381,7 @@ define i64 @test495(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB495_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8399,7 +8399,7 @@ define i64 @test496(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB496_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8417,7 +8417,7 @@ define i64 @test497(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB497_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8436,7 +8436,7 @@ define i64 @test498(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8455,7 +8455,7 @@ define i64 @test499(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8473,7 +8473,7 @@ define i8 @test500(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB500_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8492,7 +8492,7 @@ define i8 @test501(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB501_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8511,7 +8511,7 @@ define i8 @test502(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB502_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8531,7 +8531,7 @@ define i8 @test503(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8551,7 +8551,7 @@ define i8 @test504(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8569,7 +8569,7 @@ define i16 @test505(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB505_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8588,7 +8588,7 @@ define i16 @test506(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB506_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8607,7 +8607,7 @@ define i16 @test507(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB507_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8627,7 +8627,7 @@ define i16 @test508(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -8647,7 +8647,7 @@ define i16 @test509(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -8664,7 +8664,7 @@ define i32 @test510(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB510_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -8682,7 +8682,7 @@ define i32 @test511(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB511_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -8700,7 +8700,7 @@ define i32 @test512(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB512_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -8719,7 +8719,7 @@ define i32 @test513(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8738,7 +8738,7 @@ define i32 @test514(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8755,7 +8755,7 @@ define i64 @test515(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB515_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8773,7 +8773,7 @@ define i64 @test516(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB516_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8791,7 +8791,7 @@ define i64 @test517(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB517_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8810,7 +8810,7 @@ define i64 @test518(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8829,7 +8829,7 @@ define i64 @test519(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8846,7 +8846,7 @@ define i8 @test520(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB520_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8864,7 +8864,7 @@ define i8 @test521(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB521_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8882,7 +8882,7 @@ define i8 @test522(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB522_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8901,7 +8901,7 @@ define i8 @test523(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8920,7 +8920,7 @@ define i8 @test524(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8937,7 +8937,7 @@ define i16 @test525(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB525_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8955,7 +8955,7 @@ define i16 @test526(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB526_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8973,7 +8973,7 @@ define i16 @test527(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB527_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8992,7 +8992,7 @@ define i16 @test528(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -9011,7 +9011,7 @@ define i16 @test529(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -9028,7 +9028,7 @@ define i32 @test530(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB530_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -9046,7 +9046,7 @@ define i32 @test531(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB531_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -9064,7 +9064,7 @@ define i32 @test532(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB532_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -9083,7 +9083,7 @@ define i32 @test533(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -9102,7 +9102,7 @@ define i32 @test534(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -9119,7 +9119,7 @@ define i64 @test535(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB535_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -9137,7 +9137,7 @@ define i64 @test536(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB536_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -9155,7 +9155,7 @@ define i64 @test537(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB537_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -9174,7 +9174,7 @@ define i64 @test538(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -9193,7 +9193,7 @@ define i64 @test539(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -9210,7 +9210,7 @@ define i8 @test540(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB540_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -9228,7 +9228,7 @@ define i8 @test541(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB541_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -9246,7 +9246,7 @@ define i8 @test542(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB542_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -9265,7 +9265,7 @@ define i8 @test543(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -9284,7 +9284,7 @@ define i8 @test544(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -9301,7 +9301,7 @@ define i16 @test545(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB545_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -9319,7 +9319,7 @@ define i16 @test546(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB546_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -9337,7 +9337,7 @@ define i16 @test547(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB547_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -9356,7 +9356,7 @@ define i16 @test548(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -9375,7 +9375,7 @@ define i16 @test549(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -9392,7 +9392,7 @@ define i32 @test550(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB550_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -9410,7 +9410,7 @@ define i32 @test551(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB551_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -9428,7 +9428,7 @@ define i32 @test552(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB552_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -9447,7 +9447,7 @@ define i32 @test553(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -9466,7 +9466,7 @@ define i32 @test554(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -9483,7 +9483,7 @@ define i64 @test555(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB555_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -9501,7 +9501,7 @@ define i64 @test556(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB556_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -9519,7 +9519,7 @@ define i64 @test557(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB557_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -9538,7 +9538,7 @@ define i64 @test558(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -9557,7 +9557,7 @@ define i64 @test559(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
diff --git a/test/CodeGen/PowerPC/bitreverse.ll b/test/CodeGen/PowerPC/bitreverse.ll
deleted file mode 100644
index dca7340d035d6..0000000000000
--- a/test/CodeGen/PowerPC/bitreverse.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -verify-machineinstrs -march=ppc64 %s -o - | FileCheck %s
-
-; These tests just check that the plumbing is in place for @llvm.bitreverse. The
-; actual output is massive at the moment as llvm.bitreverse is not yet legal.
-
-declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
-
-define <2 x i16> @f(<2 x i16> %a) {
-; CHECK-LABEL: f:
-; CHECK: rlwinm
- %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
- ret <2 x i16> %b
-}
-
-declare i8 @llvm.bitreverse.i8(i8) readnone
-
-define i8 @g(i8 %a) {
-; CHECK-LABEL: g:
-; CHECK: rlwinm
-; CHECK: rlwimi
- %b = call i8 @llvm.bitreverse.i8(i8 %a)
- ret i8 %b
-}
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index c42f677d17ab1..60bec4d18f12e 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1028,7 +1028,7 @@ entry:
; P9LE: vperm
; P9LE: blr
; P8BE: sldi {{r[0-9]+}}, r4, 2
-; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3,
+; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3
; P8BE-DAG: lxvw4x
; P8BE: vperm
; P8BE: blr
@@ -2187,7 +2187,7 @@ entry:
; P9LE: vperm
; P9LE: blr
; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3
+; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3
; P8BE-DAG: lxvw4x
; P8BE: vperm
; P8BE: blr
diff --git a/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll
new file mode 100644
index 0000000000000..71755f722cb2c
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @limit_loop(i32 signext %iters, i32* nocapture readonly %vec, i32 signext %limit) local_unnamed_addr {
+entry:
+ %cmp5 = icmp sgt i32 %iters, 0
+ br i1 %cmp5, label %for.body.preheader, label %cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = sext i32 %iters to i64
+ br label %for.body
+
+for.cond: ; preds = %for.body
+ %cmp = icmp slt i64 %indvars.iv.next, %0
+ br i1 %cmp, label %for.body, label %cleanup
+
+for.body: ; preds = %for.body.preheader, %for.cond
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.cond ]
+ %arrayidx = getelementptr inbounds i32, i32* %vec, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %1, %limit
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br i1 %cmp1, label %for.cond, label %cleanup
+
+cleanup: ; preds = %for.body, %for.cond, %entry
+ %2 = phi i32 [ 0, %entry ], [ 0, %for.cond ], [ 1, %for.body ]
+ ret i32 %2
+; CHECK-LABEL: limit_loop
+; CHECK: mtctr
+; CHECK-NOT: addi {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK: bdnz
+; CHECK: blr
+}
+
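For reference, the IR in limit_loop above corresponds to C source roughly like the following (a reconstruction from the IR, not a file in this commit; the name and parameters are taken from the test). The CHECK lines assert that the trip count is carried in the CTR register via mtctr/bdnz and that the now-dead induction-variable increment is not kept as a separate addi:

/* Hypothetical C equivalent of @limit_loop (reconstructed, not in the commit). */
int limit_loop(int iters, const int *vec, int limit) {
  for (int i = 0; i < iters; i++) {
    if (vec[i] >= limit)  /* %cmp1 fails: take the %cleanup arm that yields 1 */
      return 1;
  }
  return 0;               /* loop ran to completion (or iters <= 0) */
}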
diff --git a/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll b/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
new file mode 100644
index 0000000000000..87b45beeab7e0
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
@@ -0,0 +1,32 @@
+; Note: the formula for aligning a negative offset should be y = x & ~(n-1), not y = (x + (n-1)) & ~(n-1).
+; With the fix from https://reviews.llvm.org/D34337 we save 16 bytes in the best case.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
+
+define signext i32 @bar(i32 signext %ii) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $0, $1, $2\0A", "=r,r,r,~{f14},~{r15},~{v20}"(i32 %ii, i32 10)
+ ret i32 %0
+; Before the fix by patch D34337:
+; stdu 1, -544(1)
+; std 15, 264(1)
+; stfd 14, 400(1)
+; stdu 1, -560(1)
+; std 15, 280(1)
+; stfd 14, 416(1)
+
+; After the fix by patch D34337:
+; CHECK-LE: stdu 1, -528(1)
+; CHECK-LE: std 15, 248(1)
+; CHECK-LE: stfd 14, 384(1)
+; CHECK-BE: stdu 1, -544(1)
+; CHECK-BE: std 15, 264(1)
+; CHECK-BE: stfd 14, 400(1)
+}
+
+define signext i32 @foo() {
+entry:
+ %call = tail call signext i32 @bar(i32 signext 5)
+ ret i32 %call
+}
+
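To see the rounding behaviour the note above describes, consider this standalone sketch (the numbers here are illustrative only; the 16-byte saving asserted by the CHECK lines comes from the actual frame layout, not from these values):

#include <assert.h>
#include <stdint.h>

/* Rounds toward negative infinity, so a negative stack offset always
   moves far enough down to honour the alignment. */
static intptr_t align_down(intptr_t x, intptr_t n) {
  return x & ~(n - 1);
}

/* Rounds toward positive infinity, which for a negative offset means
   toward zero, i.e. it can under-allocate. */
static intptr_t align_up(intptr_t x, intptr_t n) {
  return (x + (n - 1)) & ~(n - 1);
}

int main(void) {
  assert(align_down(-530, 16) == -544);
  assert(align_up(-530, 16) == -528);
  return 0;
}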
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
index 0e871c3588691..3a425406d043f 100644
--- a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -53,8 +53,8 @@ entry:
ret void
}
; CHECK: @caller2
-; CHECK: li [[TOCOFF:[0-9]+]], 136
-; CHECK: stxsspx {{[0-9]+}}, 1, [[TOCOFF]]
+; CHECK: addi [[TOCOFF:[0-9]+]], {{[0-9]+}}, 136
+; CHECK: stxsspx {{[0-9]+}}, 0, [[TOCOFF]]
; CHECK: bl test2
declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
diff --git a/test/CodeGen/PowerPC/pr33093.ll b/test/CodeGen/PowerPC/pr33093.ll
new file mode 100644
index 0000000000000..5212973f8317c
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr33093.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+define zeroext i32 @ReverseBits(i32 zeroext %n) {
+; CHECK-LABEL: ReverseBits:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: slwi 6, 3, 1
+; CHECK-NEXT: srwi 3, 3, 1
+; CHECK-NEXT: lis 7, -13108
+; CHECK-NEXT: lis 8, 13107
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: lis 10, -3856
+; CHECK-NEXT: lis 11, 3855
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: ori 5, 8, 13107
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 7, 52428
+; CHECK-NEXT: slwi 9, 3, 2
+; CHECK-NEXT: srwi 3, 3, 2
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 9, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: slwi 12, 3, 4
+; CHECK-NEXT: srwi 3, 3, 4
+; CHECK-NEXT: and 4, 12, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rotlwi 4, 3, 24
+; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31
+; CHECK-NEXT: rldicl 3, 4, 0, 32
+; CHECK-NEXT: clrldi 3, 3, 32
+; CHECK-NEXT: blr
+entry:
+ %shr = lshr i32 %n, 1
+ %and = and i32 %shr, 1431655765
+ %and1 = shl i32 %n, 1
+ %shl = and i32 %and1, -1431655766
+ %or = or i32 %and, %shl
+ %shr2 = lshr i32 %or, 2
+ %and3 = and i32 %shr2, 858993459
+ %and4 = shl i32 %or, 2
+ %shl5 = and i32 %and4, -858993460
+ %or6 = or i32 %and3, %shl5
+ %shr7 = lshr i32 %or6, 4
+ %and8 = and i32 %shr7, 252645135
+ %and9 = shl i32 %or6, 4
+ %shl10 = and i32 %and9, -252645136
+ %or11 = or i32 %and8, %shl10
+ %shr13 = lshr i32 %or11, 24
+ %and14 = lshr i32 %or11, 8
+ %shr15 = and i32 %and14, 65280
+ %and17 = shl i32 %or11, 8
+ %shl18 = and i32 %and17, 16711680
+ %shl21 = shl i32 %or11, 24
+ %or16 = or i32 %shl21, %shr13
+ %or19 = or i32 %or16, %shr15
+ %or22 = or i32 %or19, %shl18
+ ret i32 %or22
+}
+
+define i64 @ReverseBits64(i64 %n) {
+; CHECK-LABEL: ReverseBits64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: lis 6, -13108
+; CHECK-NEXT: lis 7, 13107
+; CHECK-NEXT: sldi 8, 3, 1
+; CHECK-NEXT: rldicl 3, 3, 63, 1
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: ori 6, 6, 52428
+; CHECK-NEXT: ori 7, 7, 13107
+; CHECK-NEXT: sldi 4, 4, 32
+; CHECK-NEXT: sldi 5, 5, 32
+; CHECK-NEXT: oris 4, 4, 43690
+; CHECK-NEXT: oris 5, 5, 21845
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: lis 7, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 12, 5, 52428
+; CHECK-NEXT: oris 9, 6, 13107
+; CHECK-NEXT: lis 6, -3856
+; CHECK-NEXT: ori 7, 7, 3855
+; CHECK-NEXT: sldi 8, 3, 2
+; CHECK-NEXT: ori 4, 12, 52428
+; CHECK-NEXT: rldicl 3, 3, 62, 2
+; CHECK-NEXT: ori 5, 9, 13107
+; CHECK-NEXT: ori 6, 6, 61680
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 10, 5, 61680
+; CHECK-NEXT: oris 11, 6, 3855
+; CHECK-NEXT: sldi 6, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: rldicl 3, 3, 60, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rldicl 4, 3, 32, 32
+; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31
+; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31
+; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15
+; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31
+; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31
+; CHECK-NEXT: sldi 12, 5, 32
+; CHECK-NEXT: or 3, 12, 6
+; CHECK-NEXT: blr
+entry:
+ %shr = lshr i64 %n, 1
+ %and = and i64 %shr, 6148914691236517205
+ %and1 = shl i64 %n, 1
+ %shl = and i64 %and1, -6148914691236517206
+ %or = or i64 %and, %shl
+ %shr2 = lshr i64 %or, 2
+ %and3 = and i64 %shr2, 3689348814741910323
+ %and4 = shl i64 %or, 2
+ %shl5 = and i64 %and4, -3689348814741910324
+ %or6 = or i64 %and3, %shl5
+ %shr7 = lshr i64 %or6, 4
+ %and8 = and i64 %shr7, 1085102592571150095
+ %and9 = shl i64 %or6, 4
+ %shl10 = and i64 %and9, -1085102592571150096
+ %or11 = or i64 %and8, %shl10
+ %shr13 = lshr i64 %or11, 56
+ %and14 = lshr i64 %or11, 40
+ %shr15 = and i64 %and14, 65280
+ %and17 = lshr i64 %or11, 24
+ %shr18 = and i64 %and17, 16711680
+ %and20 = lshr i64 %or11, 8
+ %shr21 = and i64 %and20, 4278190080
+ %and23 = shl i64 %or11, 8
+ %shl24 = and i64 %and23, 1095216660480
+ %and26 = shl i64 %or11, 24
+ %shl27 = and i64 %and26, 280375465082880
+ %and29 = shl i64 %or11, 40
+ %shl30 = and i64 %and29, 71776119061217280
+ %shl33 = shl i64 %or11, 56
+ %or16 = or i64 %shl33, %shr13
+ %or19 = or i64 %or16, %shr15
+ %or22 = or i64 %or19, %shr18
+ %or25 = or i64 %or22, %shr21
+ %or28 = or i64 %or25, %shl24
+ %or31 = or i64 %or28, %shl27
+ %or34 = or i64 %or31, %shl30
+ ret i64 %or34
+}
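The masks in the two functions above are the classic bit-reversal constants (1431655765 = 0x55555555, 858993459 = 0x33333333, 252645135 = 0x0F0F0F0F). For orientation, a C rendering of the 32-bit variant (a sketch, not part of the commit):

#include <stdint.h>

/* Swap adjacent bits, then bit pairs, then nibbles, then reverse the
   bytes -- the same shl/lshr/and/or chain as @ReverseBits above. */
uint32_t reverse_bits32(uint32_t n) {
  n = ((n >> 1) & 0x55555555u) | ((n << 1) & 0xAAAAAAAAu);
  n = ((n >> 2) & 0x33333333u) | ((n << 2) & 0xCCCCCCCCu);
  n = ((n >> 4) & 0x0F0F0F0Fu) | ((n << 4) & 0xF0F0F0F0u);
  return (n >> 24) | ((n >> 8) & 0xFF00u) |
         ((n << 8) & 0xFF0000u) | (n << 24);
}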
diff --git a/test/CodeGen/PowerPC/select-addrRegRegOnly.ll b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll
new file mode 100644
index 0000000000000..f880d1faf9d90
--- /dev/null
+++ b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown -verify-machineinstrs < %s | FileCheck %s
+
+; Function Attrs: norecurse nounwind readonly
+define float @testSingleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 {
+; CHECK-LABEL: testSingleAccess:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addi 3, 3, 8
+; CHECK-NEXT: lxsiwax 0, 0, 3
+; CHECK-NEXT: xscvsxdsp 1, 0
+; CHECK-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2
+ %0 = load i32, i32* %arrayidx, align 4
+ %conv = sitofp i32 %0 to float
+ ret float %conv
+}
+
+; Function Attrs: norecurse nounwind readonly
+define float @testMultipleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 {
+; CHECK-LABEL: testMultipleAccess:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lwz 4, 8(3)
+; CHECK-NEXT: lwz 12, 12(3)
+; CHECK-NEXT: add 3, 12, 4
+; CHECK-NEXT: mtvsrwa 0, 3
+; CHECK-NEXT: xscvsxdsp 1, 0
+; CHECK-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 3
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %1, %0
+ %conv = sitofp i32 %add to float
+ ret float %conv
+}
diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll
index 7bb6cc180c962..26c4410ded6d2 100644
--- a/test/CodeGen/PowerPC/svr4-redzone.ll
+++ b/test/CodeGen/PowerPC/svr4-redzone.ll
@@ -29,11 +29,11 @@ entry:
define i8* @bigstack() nounwind {
entry:
- %0 = alloca i8, i32 230
+ %0 = alloca i8, i32 290
ret i8* %0
}
; PPC32-LABEL: bigstack:
-; PPC32: stwu 1, -240(1)
+; PPC32: stwu 1, -304(1)
; PPC64-LABEL: bigstack:
-; PPC64: stdu 1, -288(1)
+; PPC64: stdu 1, -352(1)
diff --git a/test/CodeGen/PowerPC/tailcall1-64.ll b/test/CodeGen/PowerPC/tailcall1-64.ll
index 3dc2672556eaf..58ab0bce309c2 100644
--- a/test/CodeGen/PowerPC/tailcall1-64.ll
+++ b/test/CodeGen/PowerPC/tailcall1-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -relocation-model=static -verify-machineinstrs < %s -march=ppc64 -tailcallopt | grep TC_RETURNd8
+; RUN: llc -relocation-model=static -verify-machineinstrs < %s -mtriple=ppc64-- -tailcallopt | grep TC_RETURNd8
+; RUN: llc -relocation-model=static -verify-machineinstrs -mtriple=ppc64-- < %s | FileCheck %s
define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
entry:
ret i32 %a3
@@ -6,6 +7,8 @@ entry:
define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
entry:
- %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 )
ret i32 %tmp11
+; CHECK-LABEL: tailcaller
+; CHECK-NOT: stdu
}
diff --git a/test/CodeGen/PowerPC/testBitReverse.ll b/test/CodeGen/PowerPC/testBitReverse.ll
new file mode 100644
index 0000000000000..6993d17ad8f34
--- /dev/null
+++ b/test/CodeGen/PowerPC/testBitReverse.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+declare i32 @llvm.bitreverse.i32(i32)
+define i32 @testBitReverseIntrinsicI32(i32 %arg) {
+; CHECK-LABEL: testBitReverseIntrinsicI32:
+; CHECK: # BB#0:
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: slwi 6, 3, 1
+; CHECK-NEXT: srwi 3, 3, 1
+; CHECK-NEXT: lis 7, -13108
+; CHECK-NEXT: lis 8, 13107
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: lis 10, -3856
+; CHECK-NEXT: lis 11, 3855
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: ori 5, 8, 13107
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 7, 52428
+; CHECK-NEXT: slwi 9, 3, 2
+; CHECK-NEXT: srwi 3, 3, 2
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 9, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: slwi 12, 3, 4
+; CHECK-NEXT: srwi 3, 3, 4
+; CHECK-NEXT: and 4, 12, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rotlwi 4, 3, 24
+; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31
+; CHECK-NEXT: rldicl 3, 4, 0, 32
+; CHECK-NEXT: blr
+ %res = call i32 @llvm.bitreverse.i32(i32 %arg)
+ ret i32 %res
+}
+
+declare i64 @llvm.bitreverse.i64(i64)
+define i64 @testBitReverseIntrinsicI64(i64 %arg) {
+; CHECK-LABEL: testBitReverseIntrinsicI64:
+; CHECK: # BB#0:
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: lis 6, -13108
+; CHECK-NEXT: lis 7, 13107
+; CHECK-NEXT: sldi 8, 3, 1
+; CHECK-NEXT: rldicl 3, 3, 63, 1
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: ori 6, 6, 52428
+; CHECK-NEXT: ori 7, 7, 13107
+; CHECK-NEXT: sldi 4, 4, 32
+; CHECK-NEXT: sldi 5, 5, 32
+; CHECK-NEXT: oris 4, 4, 43690
+; CHECK-NEXT: oris 5, 5, 21845
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: lis 7, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 12, 5, 52428
+; CHECK-NEXT: oris 9, 6, 13107
+; CHECK-NEXT: lis 6, -3856
+; CHECK-NEXT: ori 7, 7, 3855
+; CHECK-NEXT: sldi 8, 3, 2
+; CHECK-NEXT: ori 4, 12, 52428
+; CHECK-NEXT: rldicl 3, 3, 62, 2
+; CHECK-NEXT: ori 5, 9, 13107
+; CHECK-NEXT: ori 6, 6, 61680
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 10, 5, 61680
+; CHECK-NEXT: oris 11, 6, 3855
+; CHECK-NEXT: sldi 6, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: rldicl 3, 3, 60, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rldicl 4, 3, 32, 32
+; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31
+; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31
+; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15
+; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31
+; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31
+; CHECK-NEXT: sldi 12, 5, 32
+; CHECK-NEXT: or 3, 12, 6
+; CHECK-NEXT: blr
+ %res = call i64 @llvm.bitreverse.i64(i64 %arg)
+ ret i64 %res
+}
diff --git a/test/CodeGen/PowerPC/vec_extract_p9.ll b/test/CodeGen/PowerPC/vec_extract_p9.ll
new file mode 100644
index 0000000000000..241209a0e6b75
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_extract_p9.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE
+
+define zeroext i8 @test1(<16 x i8> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test1:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextubrx 3, 5, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 56
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test1:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextublx 3, 5, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 56
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 %index
+ ret i8 %vecext
+}
+
+define signext i8 @test2(<16 x i8> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test2:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextubrx 3, 5, 2
+; CHECK-LE-NEXT: extsb 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test2:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextublx 3, 5, 2
+; CHECK-BE-NEXT: extsb 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 %index
+ ret i8 %vecext
+}
+
+define zeroext i16 @test3(<8 x i16> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test3:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 48
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test3:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 48
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 %index
+ ret i16 %vecext
+}
+
+define signext i16 @test4(<8 x i16> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test4:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: extsh 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test4:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: extsh 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 %index
+ ret i16 %vecext
+}
+
+define zeroext i32 @test5(<4 x i32> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test5:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test5:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 %index
+ ret i32 %vecext
+}
+
+define signext i32 @test6(<4 x i32> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test6:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: extsw 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test6:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: extsw 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 %index
+ ret i32 %vecext
+}
+
+; Test with immediate index
+define zeroext i8 @test7(<16 x i8> %a) {
+; CHECK-LE-LABEL: test7:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 1
+; CHECK-LE-NEXT: vextubrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 56
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test7:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 1
+; CHECK-BE-NEXT: vextublx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 56
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 1
+ ret i8 %vecext
+}
+
+define zeroext i16 @test8(<8 x i16> %a) {
+; CHECK-LE-LABEL: test8:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 2
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 48
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test8:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 2
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 48
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 1
+ ret i16 %vecext
+}
+
+define zeroext i32 @test9(<4 x i32> %a) {
+; CHECK-LE-LABEL: test9:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 4
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test9:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 4
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 1
+ ret i32 %vecext
+}
diff --git a/test/CodeGen/PowerPC/vec_int_ext.ll b/test/CodeGen/PowerPC/vec_int_ext.ll
index 9e1218c423b7c..d7bed503318eb 100644
--- a/test/CodeGen/PowerPC/vec_int_ext.ll
+++ b/test/CodeGen/PowerPC/vec_int_ext.ll
@@ -1,12 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9
-target triple = "powerpc64le-unknown-linux-gnu"
-
-define <4 x i32> @vextsb2w(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2w:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsb2w 2, 2
-; PWR9-NEXT: blr
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE
+
+define <4 x i32> @vextsb2wLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2wLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsb2w 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsb2wLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsb2w 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <16 x i8> %a, i32 0
%conv = sext i8 %vecext to i32
@@ -23,11 +29,17 @@ entry:
ret <4 x i32> %vecinit9
}
-define <2 x i64> @vextsb2d(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsb2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsb2dLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsb2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsb2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsb2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <16 x i8> %a, i32 0
%conv = sext i8 %vecext to i64
@@ -38,11 +50,17 @@ entry:
ret <2 x i64> %vecinit3
}
-define <4 x i32> @vextsh2w(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2w:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsh2w 2, 2
-; PWR9-NEXT: blr
+define <4 x i32> @vextsh2wLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2wLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsh2w 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsh2wLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsh2w 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <8 x i16> %a, i32 0
%conv = sext i16 %vecext to i32
@@ -59,11 +77,17 @@ entry:
ret <4 x i32> %vecinit9
}
-define <2 x i64> @vextsh2d(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsh2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsh2dLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsh2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsh2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsh2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <8 x i16> %a, i32 0
%conv = sext i16 %vecext to i64
@@ -74,11 +98,17 @@ entry:
ret <2 x i64> %vecinit3
}
-define <2 x i64> @vextsw2d(<4 x i32> %a) {
-; PWR9-LABEL: vextsw2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsw2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsw2dLE(<4 x i32> %a) {
+; CHECK-LE-LABEL: vextsw2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsw2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsw2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vmrgew
+; CHECK-BE-NEXT: vextsw2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <4 x i32> %a, i32 0
%conv = sext i32 %vecext to i64
@@ -88,3 +118,170 @@ entry:
%vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
ret <2 x i64> %vecinit3
}
+
+define <4 x i32> @vextsb2wBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2wBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsb2w 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsb2wBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 13
+; CHECK-LE-NEXT: vextsb2w 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 3
+ %conv = sext i8 %vecext to i32
+ %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 7
+ %conv2 = sext i8 %vecext1 to i32
+ %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+ %vecext4 = extractelement <16 x i8> %a, i32 11
+ %conv5 = sext i8 %vecext4 to i32
+ %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+ %vecext7 = extractelement <16 x i8> %a, i32 15
+ %conv8 = sext i8 %vecext7 to i32
+ %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+ ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsb2dBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsb2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsb2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 9
+; CHECK-LE-NEXT: vextsb2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 7
+ %conv = sext i8 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 15
+ %conv2 = sext i8 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <4 x i32> @vextsh2wBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2wBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsh2w 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsh2wBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 14
+; CHECK-LE-NEXT: vextsh2w 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 1
+ %conv = sext i16 %vecext to i32
+ %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+ %vecext1 = extractelement <8 x i16> %a, i32 3
+ %conv2 = sext i16 %vecext1 to i32
+ %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+ %vecext4 = extractelement <8 x i16> %a, i32 5
+ %conv5 = sext i16 %vecext4 to i32
+ %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+ %vecext7 = extractelement <8 x i16> %a, i32 7
+ %conv8 = sext i16 %vecext7 to i32
+ %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+ ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsh2dBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsh2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsh2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 10
+; CHECK-LE-NEXT: vextsh2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 3
+ %conv = sext i16 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <8 x i16> %a, i32 7
+ %conv2 = sext i16 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextsw2dBE(<4 x i32> %a) {
+; CHECK-BE-LABEL: vextsw2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsw2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsw2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 12
+; CHECK-LE-NEXT: vextsw2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 1
+ %conv = sext i32 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <4 x i32> %a, i32 3
+ %conv2 = sext i32 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextDiffVectors(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LE-LABEL: vextDiffVectors:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NOT: vextsw2d
+
+; CHECK-BE-LABEL: vextDiffVectors:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NOT: vextsw2d
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 0
+ %conv = sext i32 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <4 x i32> %b, i32 2
+ %conv2 = sext i32 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <8 x i16> @testInvalidExtend(<16 x i8> %a) {
+entry:
+; CHECK-LE-LABEL: testInvalidExtend:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NOT: vexts
+
+; CHECK-BE-LABEL: testInvalidExtend:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NOT: vexts
+
+ %vecext = extractelement <16 x i8> %a, i32 0
+ %conv = sext i8 %vecext to i16
+ %vecinit = insertelement <8 x i16> undef, i16 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 2
+ %conv2 = sext i8 %vecext1 to i16
+ %vecinit3 = insertelement <8 x i16> %vecinit, i16 %conv2, i32 1
+ %vecext4 = extractelement <16 x i8> %a, i32 4
+ %conv5 = sext i8 %vecext4 to i16
+ %vecinit6 = insertelement <8 x i16> %vecinit3, i16 %conv5, i32 2
+ %vecext7 = extractelement <16 x i8> %a, i32 6
+ %conv8 = sext i8 %vecext7 to i16
+ %vecinit9 = insertelement <8 x i16> %vecinit6, i16 %conv8, i32 3
+ %vecext10 = extractelement <16 x i8> %a, i32 8
+ %conv11 = sext i8 %vecext10 to i16
+ %vecinit12 = insertelement <8 x i16> %vecinit9, i16 %conv11, i32 4
+ %vecext13 = extractelement <16 x i8> %a, i32 10
+ %conv14 = sext i8 %vecext13 to i16
+ %vecinit15 = insertelement <8 x i16> %vecinit12, i16 %conv14, i32 5
+ %vecext16 = extractelement <16 x i8> %a, i32 12
+ %conv17 = sext i8 %vecext16 to i16
+ %vecinit18 = insertelement <8 x i16> %vecinit15, i16 %conv17, i32 6
+ %vecext19 = extractelement <16 x i8> %a, i32 14
+ %conv20 = sext i8 %vecext19 to i16
+ %vecinit21 = insertelement <8 x i16> %vecinit18, i16 %conv20, i32 7
+ ret <8 x i16> %vecinit21
+}
diff --git a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
index 67146e40db0e8..5346d8a429fbd 100644
--- a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
+++ b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
@@ -321,8 +321,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecucus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -385,8 +385,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecscus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -487,8 +487,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecucss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -540,8 +540,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecscss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
diff --git a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
new file mode 100644
index 0000000000000..8798fcecfc3b9
--- /dev/null
+++ b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
@@ -0,0 +1,34 @@
+# RUN: llc -verify-machineinstrs -run-pass regallocfast -mtriple s390x-ibm-linux -o - %s | FileCheck %s
+--- |
+
+ @g_167 = external global [5 x i64], align 8
+ define void @main() local_unnamed_addr {
+ ret void
+ }
+...
+# Make sure the usage of different subregisters on the same virtual register
+# does not result in invalid kill flags.
+# PR33677
+---
+name: main
+alignment: 2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr128bit }
+ - { id: 1, class: gr64bit }
+ - { id: 2, class: addr64bit }
+# CHECK: %r0q = L128
+# CHECK-NEXT: %r0l = COPY %r1l
+# Although R0L partially redefines R0Q, it must not mark R0Q as kill
+# because R1D is still live through that instruction.
+# CHECK-NOT: %r0q<imp-use,kill>
+# CHECK-NEXT: %r2d = COPY %r1d
+# CHECK-NEXT: LARL
+body: |
+ bb.0:
+ %0.subreg_hl32 = COPY %0.subreg_l32
+ %1 = COPY %0.subreg_l64
+ %2 = LARL @g_167
+ STC %1.subreg_l32, %2, 8, _
+
+...
diff --git a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
index 9fcc0f5d617b0..5c3800e970930 100644
--- a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
+++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
@@ -95,15 +95,17 @@ if.end:
}
; CHECK-LABEL: diamond2:
-; CHECK-BP: itte
-; CHECK-BP: streq
-; CHECK-BP: ldreq
-; CHECK-BP: strne
-; CHECK-NOBP: cbz
-; CHECK-NOBP: str
-; CHECK-NOBP: b
-; CHECK-NOBP: str
-; CHECK-NOBP: ldr
+; CHECK-BP: cbz
+; CHECK-BP: str
+; CHECK-BP: str
+; CHECK-BP: b
+; CHECK-BP: str
+; CHECK-BP: ldr
+; CHECK-NOBP: ittee
+; CHECK-NOBP: streq
+; CHECK-NOBP: ldreq
+; CHECK-NOBP: strne
+; CHECK-NOBP: strne
define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
entry:
%tobool = icmp eq i32 %n, 0
@@ -111,6 +113,8 @@ entry:
if.then:
store i32 %n, i32* %p, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 2
+ store i32 %n, i32* %arrayidx, align 4
br label %if.end
if.else:
diff --git a/test/CodeGen/WebAssembly/umulo-i64.ll b/test/CodeGen/WebAssembly/umulo-i64.ll
new file mode 100644
index 0000000000000..e47c8aa0bb3a9
--- /dev/null
+++ b/test/CodeGen/WebAssembly/umulo-i64.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; Test that UMULO works correctly on 64-bit operands.
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-emscripten"
+
+; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE:
+; CHECK: __multi3
+; Function Attrs: inlinehint
+define void @"_ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE"(i64, i64) unnamed_addr #0 {
+start:
+ %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1)
+ %3 = extractvalue { i64, i1 } %2, 0
+ store i64 %3, i64* undef
+ unreachable
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { inlinehint }
+attributes #1 = { nounwind readnone speculatable }
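The intrinsic exercised above is what clang emits for a checked unsigned 64-bit multiply; on wasm32 the overflow test needs the high half of a 128-bit product, hence the __multi3 libcall the CHECK line expects. A minimal C source that produces @llvm.umul.with.overflow.i64 (a sketch; this file is not part of the commit):

#include <stdbool.h>
#include <stdint.h>

/* With clang this lowers to @llvm.umul.with.overflow.i64. */
bool mul_overflows_u64(uint64_t a, uint64_t b, uint64_t *out) {
  return __builtin_mul_overflow(a, b, out);
}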
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
index c03b923cadba2..cba208e62a147 100644
--- a/test/CodeGen/X86/2012-08-16-setcc.ll
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -1,45 +1,53 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; rdar://12081007
-; CHECK-LABEL: and_1:
-; CHECK: andb
-; CHECK-NEXT: cmovnel
-; CHECK: ret
define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+; CHECK-LABEL: and_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: andb %dil, %sil
+; CHECK-NEXT: cmovnel %edx, %eax
+; CHECK-NEXT: retq
%1 = and i8 %b, %a
%2 = icmp ne i8 %1, 0
%3 = select i1 %2, i32 %x, i32 0
ret i32 %3
}
-; CHECK-LABEL: and_2:
-; CHECK: andb
-; CHECK-NEXT: setne
-; CHECK: ret
define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: and_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andb %dil, %sil
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%1 = and i8 %b, %a
%2 = icmp ne i8 %1, 0
ret i1 %2
}
-; CHECK-LABEL: xor_1:
-; CHECK: xorb
-; CHECK-NEXT: cmovnel
-; CHECK: ret
define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+; CHECK-LABEL: xor_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorb %dil, %sil
+; CHECK-NEXT: cmovnel %edx, %eax
+; CHECK-NEXT: retq
%1 = xor i8 %b, %a
%2 = icmp ne i8 %1, 0
%3 = select i1 %2, i32 %x, i32 0
ret i32 %3
}
-; CHECK-LABEL: xor_2:
-; CHECK: xorb
-; CHECK-NEXT: setne
-; CHECK: ret
define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: xor_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorb %dil, %sil
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%1 = xor i8 %b, %a
%2 = icmp ne i8 %1, 0
ret i1 %2
}
+
diff --git a/test/CodeGen/X86/GC/badreadproto.ll b/test/CodeGen/X86/GC/badreadproto.ll
index 37672f8043574..aad79d75218a5 100644
--- a/test/CodeGen/X86/GC/badreadproto.ll
+++ b/test/CodeGen/X86/GC/badreadproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
diff --git a/test/CodeGen/X86/GC/badrootproto.ll b/test/CodeGen/X86/GC/badrootproto.ll
index ff86d03c646a2..37a3451c2c17e 100644
--- a/test/CodeGen/X86/GC/badrootproto.ll
+++ b/test/CodeGen/X86/GC/badrootproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
%meta = type opaque
diff --git a/test/CodeGen/X86/GC/badwriteproto.ll b/test/CodeGen/X86/GC/badwriteproto.ll
index 2544e40f81ff6..62c157477635a 100644
--- a/test/CodeGen/X86/GC/badwriteproto.ll
+++ b/test/CodeGen/X86/GC/badwriteproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
diff --git a/test/CodeGen/X86/GC/fat.ll b/test/CodeGen/X86/GC/fat.ll
index d05ca3da8195a..316a80343e2fb 100644
--- a/test/CodeGen/X86/GC/fat.ll
+++ b/test/CodeGen/X86/GC/fat.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @llvm.gcroot(i8**, i8*) nounwind
diff --git a/test/CodeGen/X86/GC/outside.ll b/test/CodeGen/X86/GC/outside.ll
index 2968c6917ce14..55eda54537898 100644
--- a/test/CodeGen/X86/GC/outside.ll
+++ b/test/CodeGen/X86/GC/outside.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @llvm.gcroot(i8**, i8*)
diff --git a/test/CodeGen/X86/GlobalISel/GV.ll b/test/CodeGen/X86/GlobalISel/GV.ll
new file mode 100644
index 0000000000000..44862ab5a96ea
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/GV.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-apple-darwin -global-isel -verify-machineinstrs -relocation-model=pic < %s -o - | FileCheck %s --check-prefix=X64_DARWIN_PIC
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI
+
+@g_int = global i32 0, align 4
+
+; Function Attrs: noinline nounwind optnone uwtable
+define i32* @test_global_ptrv() #3 {
+; X64-LABEL: test_global_ptrv:
+; X64: # BB#0: # %entry
+; X64-NEXT: leaq g_int, %rax
+; X64-NEXT: retq
+;
+; X64_DARWIN_PIC-LABEL: test_global_ptrv:
+; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
+; X64_DARWIN_PIC-NEXT: retq
+;
+; X32-LABEL: test_global_ptrv:
+; X32: # BB#0: # %entry
+; X32-NEXT: leal g_int, %eax
+; X32-NEXT: retl
+;
+; X32ABI-LABEL: test_global_ptrv:
+; X32ABI: # BB#0: # %entry
+; X32ABI-NEXT: leal g_int, %eax
+; X32ABI-NEXT: retq
+entry:
+ ret i32* @g_int
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define i32 @test_global_valv() #3 {
+; X64-LABEL: test_global_valv:
+; X64: # BB#0: # %entry
+; X64-NEXT: leaq g_int, %rax
+; X64-NEXT: movl (%rax), %eax
+; X64-NEXT: retq
+;
+; X64_DARWIN_PIC-LABEL: test_global_valv:
+; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
+; X64_DARWIN_PIC-NEXT: movl (%rax), %eax
+; X64_DARWIN_PIC-NEXT: retq
+;
+; X32-LABEL: test_global_valv:
+; X32: # BB#0: # %entry
+; X32-NEXT: leal g_int, %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: retl
+;
+; X32ABI-LABEL: test_global_valv:
+; X32ABI: # BB#0: # %entry
+; X32ABI-NEXT: leal g_int, %eax
+; X32ABI-NEXT: movl (%eax), %eax
+; X32ABI-NEXT: retq
+entry:
+ %0 = load i32, i32* @g_int, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/add-vec.ll b/test/CodeGen/X86/GlobalISel/add-vec.ll
index 679a49d733a2f..0ea1cf820c0fa 100644
--- a/test/CodeGen/X86/GlobalISel/add-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/add-vec.ll
@@ -1,38 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=core-avx2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7-avx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+
define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; SKX-LABEL: test_add_v16i8:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v16i8:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <16 x i8> %arg1, %arg2
ret <16 x i8> %ret
}
define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; SKX-LABEL: test_add_v8i16:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <8 x i16> %arg1, %arg2
ret <8 x i16> %ret
}
define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; SKX-LABEL: test_add_v4i32:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v4i32:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <4 x i32> %arg1, %arg2
ret <4 x i32> %ret
}
define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
-; SKX-LABEL: test_add_v2i64:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v2i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <2 x i64> %arg1, %arg2
ret <2 x i64> %ret
}
@@ -42,6 +45,20 @@ define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <32 x i8> %arg1, %arg2
ret <32 x i8> %ret
}
@@ -51,6 +68,20 @@ define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <16 x i16> %arg1, %arg2
ret <16 x i16> %ret
}
@@ -60,6 +91,20 @@ define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <8 x i32> %arg1, %arg2
ret <8 x i32> %ret
}
@@ -69,6 +114,20 @@ define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <4 x i64> %arg1, %arg2
ret <4 x i64> %ret
}
@@ -78,6 +137,26 @@ define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <64 x i8> %arg1, %arg2
ret <64 x i8> %ret
}
@@ -87,6 +166,26 @@ define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <32 x i16> %arg1, %arg2
ret <32 x i16> %ret
}
@@ -96,6 +195,26 @@ define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <16 x i32> %arg1, %arg2
ret <16 x i32> %ret
}
@@ -105,6 +224,26 @@ define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <8 x i64> %arg1, %arg2
ret <8 x i64> %ret
}
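; Summary of the new coverage: AVX1 has no 256-bit integer adds, so each ymm
; add splits into two xmm vpadd ops bracketed by vextractf128/vinsertf128;
; AVX2 handles ymm natively but splits the 512-bit cases into two ymm halves;
; only the SKX run (AVX-512) issues single zmm instructions.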
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index b550bb0bc7be6..5b512f9ce9377 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -51,4 +51,13 @@ define i64 @const_i64_i32() {
ret i64 -1
}
+define void @main(i32 ** %data) {
+; ALL-LABEL: main:
+; ALL: # BB#0:
+; ALL-NEXT: movq $0, %rax
+; ALL-NEXT: movq %rax, (%rdi)
+; ALL-NEXT: retq
+ store i32* null, i32** %data, align 8
+ ret void
+}
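+; Storing 'i32* null' materializes zero into a GPR (movq $0, %rax) and then
+; does a plain 64-bit store; GlobalISel models the null pointer as
+; G_CONSTANT i64 0 (see the select-constant.mir change below).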
diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index b08ac062fb4bb..11b03bd561103 100644
--- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
-; TODO merge with ext.ll after i64 sext suported on 32bit platform
+; TODO merge with ext.ll after i64 sext supported on 32bit platform
define i64 @test_zext_i1(i8 %a) {
; X64-LABEL: test_zext_i1:
diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll
index 392c973c12084..d9a09678cf4bb 100644
--- a/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/test/CodeGen/X86/GlobalISel/ext.ll
@@ -2,6 +2,42 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+define i8 @test_zext_i1toi8(i32 %a) {
+; X64-LABEL: test_zext_i1toi8:
+; X64: # BB#0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_zext_i1toi8:
+; X32: # BB#0:
+; X32-NEXT: movl 4(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+ %val = trunc i32 %a to i1
+ %r = zext i1 %val to i8
+ ret i8 %r
+}
+
+define i16 @test_zext_i1toi16(i32 %a) {
+; X64-LABEL: test_zext_i1toi16:
+; X64: # BB#0:
+; X64-NEXT: andw $1, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_zext_i1toi16:
+; X32: # BB#0:
+; X32-NEXT: movl 4(%esp), %eax
+; X32-NEXT: andw $1, %ax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+ %val = trunc i32 %a to i1
+ %r = zext i1 %val to i16
+ ret i16 %r
+}
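+; The '# kill:' lines are asm-printer liveness annotations, not instructions:
+; they note that only the low subregister (%AL/%AX) of the 32-bit def is live
+; at the return.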
+
define i32 @test_zext_i1(i32 %a) {
; X64-LABEL: test_zext_i1:
; X64: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/legalize-GV.mir b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
new file mode 100644
index 0000000000000..7f9971e4c70a4
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
@@ -0,0 +1,31 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+--- |
+
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+...
+---
+name: test_global_ptrv
+# ALL-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: false
+regBankSelected: false
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: _, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+# ALL: %0(p0) = G_GLOBAL_VALUE @g_int
+# ALL-NEXT: %rax = COPY %0(p0)
+# ALL-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
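+# G_GLOBAL_VALUE of a p0 is already legal on both targets, so the legalizer is
+# expected to pass it through unchanged; the check lines mirror the input body
+# exactly.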
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index c9add0dc4e95c..c86bfd9ee96d3 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -1,12 +1,28 @@
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
- define i32 @test_zext_i1(i8 %a) {
+
+ define i8 @test_zext_i1toi8(i1 %a) {
+ %r = zext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_zext_i1toi16(i1 %a) {
+ %r = zext i1 %a to i16
+ ret i16 %r
+ }
+
+ define i32 @test_zext_i1(i8 %a) {
%val = trunc i8 %a to i1
%r = zext i1 %val to i32
ret i32 %r
}
+ define i16 @test_zext_i8toi16(i8 %val) {
+ %r = zext i8 %val to i16
+ ret i16 %r
+ }
+
define i32 @test_zext_i8(i8 %val) {
%r = zext i8 %val to i32
ret i32 %r
@@ -17,12 +33,27 @@
ret i32 %r
}
+ define i8 @test_sext_i1toi8(i1 %a) {
+ %r = sext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_sext_i1toi16(i1 %a) {
+ %r = sext i1 %a to i16
+ ret i16 %r
+ }
+
define i32 @test_sext_i1(i8 %a) {
%val = trunc i8 %a to i1
%r = sext i1 %val to i32
ret i32 %r
}
+ define i16 @test_sext_i8toi16(i8 %val) {
+ %r = sext i8 %val to i16
+ ret i16 %r
+ }
+
define i32 @test_sext_i8(i8 %val) {
%r = sext i8 %val to i32
ret i32 %r
@@ -35,6 +66,52 @@
...
---
+name: test_zext_i1toi8
+# ALL-LABEL: name: test_zext_i1toi8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s8) = G_ZEXT %0(s1)
+# ALL-NEXT: %al = COPY %1(s8)
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_ZEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_zext_i1toi16
+# ALL-LABEL: name: test_zext_i1toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s16) = G_ZEXT %0(s1)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_ZEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_zext_i1
# ALL-LABEL: name: test_zext_i1
alignment: 4
@@ -61,6 +138,29 @@ body: |
...
---
+name: test_zext_i8toi16
+# ALL-LABEL: name: test_zext_i8toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s16) = G_ZEXT %0(s8)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s16) = G_ZEXT %0(s8)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_zext_i8
# ALL-LABEL: name: test_zext_i8
alignment: 4
@@ -107,6 +207,52 @@ body: |
...
---
+name: test_sext_i1toi8
+# ALL-LABEL: name: test_sext_i1toi8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s8) = G_SEXT %0(s1)
+# ALL-NEXT: %al = COPY %1(s8)
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_SEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_sext_i1toi16
+# ALL-LABEL: name: test_sext_i1toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s16) = G_SEXT %0(s1)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_SEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_sext_i1
# ALL-LABEL: name: test_sext_i1
alignment: 4
@@ -133,6 +279,29 @@ body: |
...
---
+name: test_sext_i8toi16
+# ALL-LABEL: name: test_sext_i8toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s16) = G_SEXT %0(s8)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s16) = G_SEXT %0(s8)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_sext_i8
# ALL-LABEL: name: test_sext_i8
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
new file mode 100644
index 0000000000000..60d9fc63c14ad
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
@@ -0,0 +1,110 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+
+--- |
+ define void @test_memop_s8tos32() {
+ ret void
+ }
+
+ define void @test_memop_s64() {
+ ret void
+ }
+...
+---
+name: test_memop_s8tos32
+# ALL-LABEL: name: test_memop_s8tos32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+ - { id: 6, class: _, preferred-register: '' }
+ - { id: 7, class: _, preferred-register: '' }
+ - { id: 8, class: _, preferred-register: '' }
+ - { id: 9, class: _, preferred-register: '' }
+ - { id: 10, class: _, preferred-register: '' }
+# ALL: %0(p0) = IMPLICIT_DEF
+# ALL-NEXT: %11(s8) = G_LOAD %0(p0) :: (load 1)
+# ALL-NEXT: %9(s1) = G_TRUNC %11(s8)
+# ALL-NEXT: %1(s8) = G_LOAD %0(p0) :: (load 1)
+# ALL-NEXT: %2(s16) = G_LOAD %0(p0) :: (load 2)
+# ALL-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 4)
+# ALL-NEXT: %4(p0) = G_LOAD %0(p0) :: (load 8)
+# ALL-NEXT: %10(s1) = IMPLICIT_DEF
+# ALL-NEXT: %12(s8) = G_ZEXT %10(s1)
+# ALL-NEXT: G_STORE %12(s8), %0(p0) :: (store 1)
+# ALL-NEXT: %5(s8) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %5(s8), %0(p0) :: (store 1)
+# ALL-NEXT: %6(s16) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %6(s16), %0(p0) :: (store 2)
+# ALL-NEXT: %7(s32) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %7(s32), %0(p0) :: (store 4)
+# ALL-NEXT: %8(p0) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %8(p0), %0(p0) :: (store 8)
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = IMPLICIT_DEF
+ %9(s1) = G_LOAD %0(p0) :: (load 1)
+ %1(s8) = G_LOAD %0(p0) :: (load 1)
+ %2(s16) = G_LOAD %0(p0) :: (load 2)
+ %3(s32) = G_LOAD %0(p0) :: (load 4)
+ %4(p0) = G_LOAD %0(p0) :: (load 8)
+
+ %10(s1) = IMPLICIT_DEF
+ G_STORE %10, %0 :: (store 1)
+ %5(s8) = IMPLICIT_DEF
+ G_STORE %5, %0 :: (store 1)
+ %6(s16) = IMPLICIT_DEF
+ G_STORE %6, %0 :: (store 2)
+ %7(s32) = IMPLICIT_DEF
+ G_STORE %7, %0 :: (store 4)
+ %8(p0) = IMPLICIT_DEF
+ G_STORE %8, %0 :: (store 8)
+...
+---
+name: test_memop_s64
+# ALL-LABEL: name: test_memop_s64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+liveins:
+# X64: %0(p0) = IMPLICIT_DEF
+# X64-NEXT: %1(s64) = G_LOAD %0(p0) :: (load 8)
+# X64-NEXT: %2(s64) = IMPLICIT_DEF
+# X64-NEXT: G_STORE %2(s64), %0(p0) :: (store 8)
+#
+# X32: %0(p0) = IMPLICIT_DEF
+# X32-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 8)
+# X32-NEXT: %6(s32) = G_CONSTANT i32 4
+# X32-NEXT: %5(p0) = G_GEP %0, %6(s32)
+# X32-NEXT: %4(s32) = G_LOAD %5(p0) :: (load 8)
+# X32-NEXT: %1(s64) = G_MERGE_VALUES %3(s32), %4(s32)
+# X32-NEXT: %2(s64) = IMPLICIT_DEF
+# X32-NEXT: %7(s32), %8(s32) = G_UNMERGE_VALUES %2(s64)
+# X32-NEXT: G_STORE %7(s32), %0(p0) :: (store 8)
+# X32-NEXT: %10(s32) = G_CONSTANT i32 4
+# X32-NEXT: %9(p0) = G_GEP %0, %10(s32)
+# X32-NEXT: G_STORE %8(s32), %9(p0) :: (store 8)
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = IMPLICIT_DEF
+ %1(s64) = G_LOAD %0(p0) :: (load 8)
+
+ %2(s64) = IMPLICIT_DEF
+ G_STORE %2, %0 :: (store 8)
+
+...
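+# On X32 the s64 memory operations are narrowed: the load becomes two s32
+# G_LOADs (the second through a G_GEP at byte offset 4) re-joined with
+# G_MERGE_VALUES, and the store goes through G_UNMERGE_VALUES into two s32
+# G_STOREs. Note the memory operands still read '(load 8)'/'(store 8)'.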
+
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
index 2757e6493258c..1c719b1bf74da 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
@@ -4,6 +4,16 @@
;TODO merge with x86-64 tests (many operations not supported yet)

+define i1 @test_load_i1(i1 * %p1) {
+; ALL-LABEL: test_load_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movl 4(%esp), %eax
+; ALL-NEXT: movb (%eax), %al
+; ALL-NEXT: retl
+ %r = load i1, i1* %p1
+ ret i1 %r
+}
+
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
; ALL: # BB#0:
@@ -34,6 +44,18 @@ define i32 @test_load_i32(i32 * %p1) {
ret i32 %r
}
+define i1 * @test_store_i1(i1 %val, i1 * %p1) {
+; ALL-LABEL: test_store_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movb 4(%esp), %cl
+; ALL-NEXT: movl 8(%esp), %eax
+; ALL-NEXT: andb $1, %cl
+; ALL-NEXT: movb %cl, (%eax)
+; ALL-NEXT: retl
+ store i1 %val, i1* %p1
+ ret i1 * %p1;
+}
+
define i8 * @test_store_i8(i8 %val, i8 * %p1) {
; ALL-LABEL: test_store_i8:
; ALL: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index 2e04b3cf20b37..2097a3b0bfc9f 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -2,6 +2,15 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST
; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY
+define i1 @test_load_i1(i1 * %p1) {
+; ALL-LABEL: test_load_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movb (%rdi), %al
+; ALL-NEXT: retq
+ %r = load i1, i1* %p1
+ ret i1 %r
+}
+
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
; ALL: # BB#0:
@@ -70,6 +79,17 @@ define double @test_load_double(double * %p1) {
ret double %r
}
+define i1 * @test_store_i1(i1 %val, i1 * %p1) {
+; ALL-LABEL: test_store_i1:
+; ALL: # BB#0:
+; ALL-NEXT: andb $1, %dil
+; ALL-NEXT: movb %dil, (%rsi)
+; ALL-NEXT: movq %rsi, %rax
+; ALL-NEXT: retq
+ store i1 %val, i1* %p1
+ ret i1 * %p1;
+}
+
define i32 * @test_store_i32(i32 %val, i32 * %p1) {
; ALL-LABEL: test_store_i32:
; ALL: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
index 3658bc9af957a..95ef15ceb6893 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
@@ -174,6 +174,13 @@
ret i64 %ret
}
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+
...
---
name: test_add_i8
@@ -1084,4 +1091,24 @@ body: |
RET 0, implicit %rax
...
+---
+name: test_global_ptrv
+# CHECK-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+# CHECK: %0(p0) = G_GLOBAL_VALUE @g_int
+# CHECK-NEXT: %rax = COPY %0(p0)
+# CHECK-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
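+# Register-bank selection only rewrites the generic class '_' to the 'gpr'
+# bank here; lowering the G_GLOBAL_VALUE itself is left to instruction
+# selection.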
diff --git a/test/CodeGen/X86/GlobalISel/select-GV.mir b/test/CodeGen/X86/GlobalISel/select-GV.mir
new file mode 100644
index 0000000000000..2f2fd51d99d1d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-GV.mir
@@ -0,0 +1,99 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64
+# RUN: llc -mtriple=x86_64-apple-darwin -relocation-model=pic -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64_DARWIN_PIC
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32ABI
+
+--- |
+
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+
+ define i32 @test_global_valv() {
+ entry:
+ %0 = load i32, i32* @g_int, align 4
+ ret i32 %0
+ }
+
+...
+---
+name: test_global_ptrv
+# CHECK-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: true
+regBankSelected: true
+# X64ALL: registers:
+# X64ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+#
+# X32ALL: registers:
+# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+# X64: %0 = LEA64r _, 1, _, @g_int, _
+# X64-NEXT: %rax = COPY %0
+# X64-NEXT: RET 0, implicit %rax
+#
+# X64_DARWIN_PIC: %0 = LEA64r %rip, 1, _, @g_int, _
+# X64_DARWIN_PIC-NEXT: %rax = COPY %0
+# X64_DARWIN_PIC-NEXT: RET 0, implicit %rax
+#
+# X32: %0 = LEA32r _, 1, _, @g_int, _
+# X32-NEXT: %rax = COPY %0
+# X32-NEXT: RET 0, implicit %rax
+#
+# X32ABI: %0 = LEA64_32r _, 1, _, @g_int, _
+# X32ABI-NEXT: %rax = COPY %0
+# X32ABI-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
+---
+name: test_global_valv
+# CHECK-LABEL: name: test_global_valv
+alignment: 4
+legalized: true
+regBankSelected: true
+# X64ALL: registers:
+# X64ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+#
+# X32ALL: registers:
+# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X32ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# X64: %1 = LEA64r _, 1, _, @g_int, _
+# X64-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64-NEXT: %eax = COPY %0
+# X64-NEXT: RET 0, implicit %eax
+#
+# X64_DARWIN_PIC: %1 = LEA64r %rip, 1, _, @g_int, _
+# X64_DARWIN_PIC-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64_DARWIN_PIC-NEXT: %eax = COPY %0
+# X64_DARWIN_PIC-NEXT: RET 0, implicit %eax
+#
+# X32: %1 = LEA32r _, 1, _, @g_int, _
+# X32-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32-NEXT: %eax = COPY %0
+# X32-NEXT: RET 0, implicit %eax
+#
+# X32ABI: %1 = LEA64_32r _, 1, _, @g_int, _
+# X32ABI-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32ABI-NEXT: %eax = COPY %0
+# X32ABI-NEXT: RET 0, implicit %eax
+body: |
+ bb.1.entry:
+ %1(p0) = G_GLOBAL_VALUE @g_int
+ %0(s32) = G_LOAD %1(p0) :: (load 4 from @g_int)
+ %eax = COPY %0(s32)
+ RET 0, implicit %eax
+
+...
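+# Instruction selection picks the LEA variant per target: plain LEA64r for
+# static x86-64, a %rip-based LEA64r under Darwin PIC, LEA32r for i386, and
+# LEA64_32r for the x32 ABI; test_global_valv then loads through that address
+# with MOV32rm.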
diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir
index 4b91b5f9f0982..30f57418b4ce0 100644
--- a/test/CodeGen/X86/GlobalISel/select-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -29,6 +29,11 @@
ret i64 -1
}
+ define void @main(i32** %data) {
+ store i32* null, i32** %data, align 8
+ ret void
+ }
+
...
---
name: const_i8
@@ -162,3 +167,29 @@ body: |
RET 0, implicit %rax
...
+---
+name: main
+# CHECK-LABEL: name: main
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = MOV64ri32 0
+# CHECK-NEXT: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.data)
+# CHECK-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = COPY %rdi
+ %1(p0) = G_CONSTANT i64 0
+ G_STORE %1(p0), %0(p0) :: (store 8 into %ir.data)
+ RET 0
+
+...
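+# MOV64ri32 is the 64-bit move whose immediate is a sign-extended 32-bit
+# field, so it is the compact encoding for the zero pointer; the G_STORE
+# selects to MOV64mr.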
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index b52f1f6fa621e..b6734e5aa2b83 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -2,6 +2,16 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
+ define i8 @test_zext_i1toi8(i1 %a) {
+ %r = zext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_zext_i1toi16(i1 %a) {
+ %r = zext i1 %a to i16
+ ret i16 %r
+ }
+
define i32 @test_zext_i1(i1 %a) {
%r = zext i1 %a to i32
ret i32 %r
@@ -29,6 +39,60 @@
...
---
+name: test_zext_i1toi8
+# ALL-LABEL: name: test_zext_i1toi8
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %1 = AND8ri %0, 1, implicit-def %eflags
+# ALL-NEXT: %al = COPY %1
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_ZEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_zext_i1toi16
+# ALL-LABEL: name: test_zext_i1toi16
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1 = AND16ri8 %2, 1, implicit-def %eflags
+# ALL-NEXT: %ax = COPY %1
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_ZEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
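+# The i1 lives in an 8-bit register, so widening to 16 bits first re-wraps the
+# gr8 value as a gr16 via SUBREG_TO_REG and then masks everything but bit 0
+# with AND16ri8.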
+---
name: test_zext_i1
# ALL-LABEL: name: test_zext_i1
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
new file mode 100644
index 0000000000000..09dc5344796f9
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512VL
+--- |
+ define void @test_unmerge() {
+ ret void
+ }
+
+...
+---
+name: test_unmerge
+# AVX-LABEL: name: test_unmerge
+#
+# AVX512VL-LABEL: name: test_unmerge
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX: registers:
+# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
+# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
+# AVX-NEXT: - { id: 2, class: vr128, preferred-register: '' }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
+# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
+# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX: %0 = IMPLICIT_DEF
+# AVX-NEXT: %1 = COPY %0.sub_xmm
+# AVX-NEXT: %2 = VEXTRACTF128rr %0, 1
+# AVX-NEXT: %xmm0 = COPY %1
+# AVX-NEXT: %xmm1 = COPY %2
+# AVX-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+#
+# AVX512VL: %0 = IMPLICIT_DEF
+# AVX512VL-NEXT: %1 = COPY %0.sub_xmm
+# AVX512VL-NEXT: %2 = VEXTRACTF32x4Z256rr %0, 1
+# AVX512VL-NEXT: %xmm0 = COPY %1
+# AVX512VL-NEXT: %xmm1 = COPY %2
+# AVX512VL-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>), %2(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
+ %xmm0 = COPY %1(<4 x s32>)
+ %xmm1 = COPY %2(<4 x s32>)
+ RET 0, implicit %xmm0, implicit %xmm1
+
+...
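+# The low 128-bit half of the unmerge is just a sub_xmm subregister copy; only
+# the high half needs a real extract: VEXTRACTF128rr under AVX and its EVEX
+# form VEXTRACTF32x4Z256rr under AVX512VL.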
+
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
new file mode 100644
index 0000000000000..a63733d07f6a6
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
@@ -0,0 +1,74 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+--- |
+ define void @test_unmerge_v128() {
+ ret void
+ }
+
+ define void @test_unmerge_v256() {
+ ret void
+ }
+
+...
+---
+name: test_unmerge_v128
+# ALL-LABEL: name: test_unmerge_v128
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: vr128x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+ - { id: 3, class: vecr }
+ - { id: 4, class: vecr }
+# ALL: %0 = IMPLICIT_DEF
+# ALL-NEXT: %1 = COPY %0.sub_xmm
+# ALL-NEXT: %2 = VEXTRACTF32x4Zrr %0, 1
+# ALL-NEXT: %3 = VEXTRACTF32x4Zrr %0, 2
+# ALL-NEXT: %4 = VEXTRACTF32x4Zrr %0, 3
+# ALL-NEXT: %xmm0 = COPY %1
+# ALL-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>), %2(<4 x s32>), %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
+ %xmm0 = COPY %1(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_unmerge_v256
+# ALL-LABEL: name: test_unmerge_v256
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %0 = IMPLICIT_DEF
+# ALL-NEXT: %1 = COPY %0.sub_ymm
+# ALL-NEXT: %2 = VEXTRACTF64x4Zrr %0, 1
+# ALL-NEXT: %xmm0 = COPY %1
+# ALL-NEXT: RET 0, implicit %ymm0
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>), %2(<8 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
+ %xmm0 = COPY %1(<8 x s32>)
+ RET 0, implicit %ymm0
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
new file mode 100644
index 0000000000000..2743f882b2e41
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err
+; This file checks that the fallback path to selection dag works.
+; The test is fragile in the sense that it must be updated to expose
+; something that fails with global-isel.
+; When we cannot produce a test case anymore, that means we can remove
+; the fallback path.
+
+; Check that we fallback on invoke translation failures.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s80) = G_FCONSTANT x86_fp80 0xK4002A000000000000000
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_x86_fp80_dump
+; FALLBACK-WITH-REPORT-OUT-LABEL: test_x86_fp80_dump:
+define void @test_x86_fp80_dump(x86_fp80* %ptr){
+ store x86_fp80 0xK4002A000000000000000, x86_fp80* %ptr, align 16
+ ret void
+}
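+; With -global-isel-abort=2 a failure does not abort: it emits the remark and
+; the fallback warning, then recompiles the function with SelectionDAG, which
+; is why the test checks both the output and stderr streams.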
+
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index e5f7cc5c6dd8d..640b5215afe9f 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2624,7 +2624,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -2941,7 +2942,8 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx-cmp.ll b/test/CodeGen/X86/avx-cmp.ll
index a050d6abe56f9..963878b0f5632 100644
--- a/test/CodeGen/X86/avx-cmp.ll
+++ b/test/CodeGen/X86/avx-cmp.ll
@@ -1,25 +1,59 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vcmpltps %ymm
-; CHECK-NOT: vucomiss
-define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind readnone {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind {
+; CHECK-LABEL: cmp00:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = fcmp olt <8 x float> %a, %b
%s = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %s
}
-; CHECK: vcmpltpd %ymm
-; CHECK-NOT: vucomisd
-define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind readnone {
+define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind {
+; CHECK-LABEL: cmp01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = fcmp olt <4 x double> %a, %b
%s = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %s
}
-declare void @scale() nounwind uwtable
-
-; CHECK: vucomisd
-define void @render() nounwind uwtable {
+declare void @scale() nounwind
+
+define void @render() nounwind {
+; CHECK-LABEL: render:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB2_6
+; CHECK-NEXT: # BB#1: # %for.cond5.preheader
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_5: # %if.then
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq scale
+; CHECK-NEXT: .LBB2_2: # %for.cond5
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: # BB#3: # %for.cond5
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: je .LBB2_2
+; CHECK-NEXT: # BB#4: # %for.body33
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0
+; CHECK-NEXT: jne .LBB2_5
+; CHECK-NEXT: jp .LBB2_5
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .LBB2_6: # %for.end52
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
entry:
br i1 undef, label %for.cond5, label %for.end52
@@ -42,89 +76,113 @@ for.end52:
ret void
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtd %xmm
-; CHECK-NEXT: vpcmpgtd %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <8 x i32> @int256-cmp(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+define <8 x i32> @int256_cmp(<8 x i32> %i, <8 x i32> %j) nounwind {
+; CHECK-LABEL: int256_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <8 x i32> %i, %j
%x = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtq %xmm
-; CHECK-NEXT: vpcmpgtq %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @v4i64-cmp(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmp(<4 x i64> %i, <4 x i64> %j) nounwind {
+; CHECK-LABEL: v4i64_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <4 x i64> %i, %j
%x = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtw %xmm
-; CHECK-NEXT: vpcmpgtw %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <16 x i16> @v16i16-cmp(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmp(<16 x i16> %i, <16 x i16> %j) nounwind {
+; CHECK-LABEL: v16i16_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <16 x i16> %i, %j
%x = sext <16 x i1> %bincmp to <16 x i16>
ret <16 x i16> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtb %xmm
-; CHECK-NEXT: vpcmpgtb %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <32 x i8> @v32i8-cmp(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmp(<32 x i8> %i, <32 x i8> %j) nounwind {
+; CHECK-LABEL: v32i8_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <32 x i8> %i, %j
%x = sext <32 x i1> %bincmp to <32 x i8>
ret <32 x i8> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqd %xmm
-; CHECK-NEXT: vpcmpeqd %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <8 x i32> @int256-cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind {
+; CHECK-LABEL: int256_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <8 x i32> %i, %j
%x = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqq %xmm
-; CHECK-NEXT: vpcmpeqq %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @v4i64-cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind {
+; CHECK-LABEL: v4i64_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <4 x i64> %i, %j
%x = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqw %xmm
-; CHECK-NEXT: vpcmpeqw %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <16 x i16> @v16i16-cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind {
+; CHECK-LABEL: v16i16_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <16 x i16> %i, %j
%x = sext <16 x i1> %bincmp to <16 x i16>
ret <16 x i16> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqb %xmm
-; CHECK-NEXT: vpcmpeqb %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind {
+; CHECK-LABEL: v32i8_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <32 x i8> %i, %j
%x = sext <32 x i1> %bincmp to <32 x i8>
ret <32 x i8> %x
@@ -132,17 +190,28 @@ define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
;; Scalar comparison
-; CHECK: scalarcmpA
-; CHECK: vcmpeqsd
define i32 @scalarcmpA() uwtable ssp {
+; CHECK-LABEL: scalarcmpA:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: retq
%cmp29 = fcmp oeq double undef, 0.000000e+00
%res = zext i1 %cmp29 to i32
ret i32 %res
}
-; CHECK: scalarcmpB
-; CHECK: vcmpeqss
define i32 @scalarcmpB() uwtable ssp {
+; CHECK-LABEL: scalarcmpB:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
%cmp29 = fcmp oeq float undef, 0.000000e+00
%res = zext i1 %cmp29 to i32
ret i32 %res
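; The scalar compares build an all-ones/all-zeros mask in an XMM register
; (vcmpeqsd/vcmpeqss); the zext to i32 is then just a move of the mask into a
; GPR followed by 'andl $1'.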
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index d7eceb7cce664..06aadc476e4ca 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -1,13 +1,62 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc -O0 < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -check-prefix=CHECK_O0
-
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s
+; RUN: llc -O0 < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s -check-prefix=CHECK_O0
+
+define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind {
+; CHECK-LABEL: test_256_load:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $96, %rsp
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vmovaps (%rbx), %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps (%r15), %ymm1
+; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps (%r14), %ymm2
+; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; CHECK-NEXT: callq dummy
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%rbx)
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%r15)
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%r14)
+; CHECK-NEXT: addq $96, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: test_256_load:
+; CHECK_O0: # BB#0: # %entry
+; CHECK_O0-NEXT: subq $152, %rsp
+; CHECK_O0-NEXT: vmovapd (%rdi), %ymm0
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm1
+; CHECK_O0-NEXT: vmovdqa (%rdx), %ymm2
+; CHECK_O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: callq dummy
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK_O0-NEXT: vmovapd %ymm0, (%rdx)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
+; CHECK_O0-NEXT: vmovaps %ymm1, (%rsi)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
+; CHECK_O0-NEXT: vmovdqa %ymm2, (%rdi)
+; CHECK_O0-NEXT: addq $152, %rsp
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
entry:
%0 = bitcast double* %d to <4 x double>*
%tmp1.i = load <4 x double>, <4 x double>* %0, align 32
@@ -27,62 +76,115 @@ declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
;; The two tests below check that we must fold load + scalar_to_vector
;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
-; CHECK: mov00
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
+; CHECK-LABEL: mov00:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov00:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; CHECK_O0-NEXT: retq
%val = load float, float* %ptr
-; CHECK: vmovss (%
%i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %i0
-; CHECK: ret
}
-; CHECK: mov01
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
+; CHECK-LABEL: mov01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3]
+; CHECK_O0-NEXT: retq
%val = load double, double* %ptr
-; CHECK: vmovsd (%
%i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
ret <4 x double> %i0
-; CHECK: ret
}
-; CHECK: vmovaps %ymm
define void @storev16i16(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 32
unreachable
}
-; CHECK: storev16i16_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev16i16_01(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 4
unreachable
}
-; CHECK: storev32i8
-; CHECK: vmovaps %ymm
define void @storev32i8(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 32
unreachable
}
-; CHECK: storev32i8_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev32i8_01(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 4
unreachable
}
; It is faster to make two saves if the data is already in XMM registers, for
; example after an integer operation.
-; CHECK: _double_save
-; CHECK-NOT: vinsertf128 $1
-; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
-; CHECK: vmovaps %xmm
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
-entry:
+; CHECK-LABEL: double_save:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: double_save:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %Z, <8 x i32>* %P, align 16
ret void
@@ -90,60 +192,127 @@ entry:
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
-; CHECK_O0: _f_f
-; CHECK-O0: vmovss LCPI
-; CHECK-O0: vxorps %xmm
-; CHECK-O0: vmovss %xmm
define void @f_f() nounwind {
+; CHECK-LABEL: f_f:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_2
+; CHECK-NEXT: # BB#1: # %cif_mask_all
+; CHECK-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_4
+; CHECK-NEXT: # BB#3: # %cif_mixed_test_all
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
+; CHECK-NEXT: .LBB8_4: # %cif_mixed_test_any_check
+;
+; CHECK_O0-LABEL: f_f:
+; CHECK_O0: # BB#0: # %allocas
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_1
+; CHECK_O0-NEXT: jmp .LBB8_2
+; CHECK_O0-NEXT: .LBB8_1: # %cif_mask_all
+; CHECK_O0-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_3
+; CHECK_O0-NEXT: jmp .LBB8_4
+; CHECK_O0-NEXT: .LBB8_3: # %cif_mixed_test_all
+; CHECK_O0-NEXT: movl $-1, %eax
+; CHECK_O0-NEXT: vmovd %eax, %xmm0
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: # implicit-def: %RCX
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rcx)
+; CHECK_O0-NEXT: .LBB8_4: # %cif_mixed_test_any_check
allocas:
br i1 undef, label %cif_mask_all, label %cif_mask_mixed
-cif_mask_all: ; preds = %allocas
+cif_mask_all:
unreachable
-cif_mask_mixed: ; preds = %allocas
+cif_mask_mixed:
br i1 undef, label %cif_mixed_test_all, label %cif_mixed_test_any_check
-cif_mixed_test_all: ; preds = %cif_mask_mixed
+cif_mixed_test_all:
call void @llvm.x86.avx.maskstore.ps.256(i8* undef, <8 x i32> <i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x float> undef) nounwind
unreachable
-cif_mixed_test_any_check: ; preds = %cif_mask_mixed
+cif_mixed_test_any_check:
unreachable
}
-; CHECK: add8i32
-; CHECK: vmovups
-; CHECK: vmovups
-; CHECK-NOT: vinsertf128
-; CHECK-NOT: vextractf128
-; CHECK: vmovups
-; CHECK: vmovups
define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
+; CHECK-LABEL: add8i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups 16(%rsi), %xmm1
+; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add8i32:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqu (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <8 x i32>, <8 x i32>* %bp, align 1
%x = add <8 x i32> zeroinitializer, %b
store <8 x i32> %x, <8 x i32>* %ret, align 1
ret void
}
-; CHECK: add4i64a64
-; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
-; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a64:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <4 x i64>, <4 x i64>* %bp, align 64
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 64
ret void
}
-; CHECK: add4i64a16
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-NEXT: vmovaps 16(%rsi), %xmm1
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <4 x i64>, <4 x i64>* %bp, align 16
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 16
ret void
}
+
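
The hunks above replace loose, order-insensitive CHECK lines with exhaustive
blocks in which CHECK-LABEL anchors each function and CHECK-NEXT pins every
instruction; this is the layout emitted by LLVM's
utils/update_llc_test_checks.py. A minimal sketch of the pattern (the RUN
line, triple, and expected assembly here are assumptions for illustration,
not taken from this test):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

define <4 x float> @demo(<4 x float> %a) {
; CHECK-LABEL: demo:
; CHECK:       # BB#0:
; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %r = fadd <4 x float> %a, %a
  ret <4 x float> %r
}
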
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 47e95fe31bdff..a12a412fb94d6 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -10,8 +10,8 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
@@ -21,14 +21,14 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -40,8 +40,8 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
@@ -51,14 +51,14 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -70,8 +70,8 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
@@ -81,14 +81,14 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -101,8 +101,8 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
@@ -112,14 +112,14 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -131,10 +131,10 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
@@ -147,14 +147,14 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -172,10 +172,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
@@ -188,14 +188,14 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; BTVER2: # BB#0:
; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -213,10 +213,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_andpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
@@ -229,14 +229,14 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; BTVER2: # BB#0:
; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -252,10 +252,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_andps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
@@ -268,14 +268,14 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; BTVER2: # BB#0:
; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -291,10 +291,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
@@ -306,14 +306,14 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -326,9 +326,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
@@ -356,9 +356,9 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
@@ -387,9 +387,9 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
@@ -418,8 +418,8 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
; SANDY-LABEL: test_broadcastf128:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastf128:
; HASWELL: # BB#0:
@@ -443,8 +443,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
define <4 x double> @test_broadcastsd_ymm(double *%a0) {
; SANDY-LABEL: test_broadcastsd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastsd_ymm:
; HASWELL: # BB#0:
@@ -469,8 +469,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
define <4 x float> @test_broadcastss(float *%a0) {
; SANDY-LABEL: test_broadcastss:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss:
; HASWELL: # BB#0:
@@ -496,7 +496,7 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
; SANDY-LABEL: test_broadcastss_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss_ymm:
; HASWELL: # BB#0:
@@ -522,9 +522,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
@@ -560,9 +560,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
@@ -598,9 +598,9 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
@@ -613,14 +613,14 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2pd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = sitofp <4 x i32> %a0 to <4 x double>
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
@@ -632,12 +632,12 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
@@ -650,14 +650,14 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2ps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = sitofp <8 x i32> %a0 to <8 x float>
%2 = load <8 x i32>, <8 x i32> *%a1, align 16
@@ -669,10 +669,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
@@ -704,10 +704,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
@@ -741,8 +741,8 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
@@ -774,9 +774,9 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:3.00]
+; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
@@ -786,14 +786,14 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fdiv <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -804,9 +804,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
@@ -816,14 +816,14 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fdiv <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -834,9 +834,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
@@ -866,9 +866,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
; SANDY-LABEL: test_extractf128:
; SANDY: # BB#0:
; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_extractf128:
; HASWELL: # BB#0:
@@ -900,7 +900,7 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY: # BB#0:
; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
@@ -929,9 +929,9 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
@@ -960,9 +960,9 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
@@ -991,9 +991,9 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
@@ -1023,9 +1023,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; SANDY-LABEL: test_insertf128:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertf128:
; HASWELL: # BB#0:
@@ -1038,14 +1038,14 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; BTVER2: # BB#0:
; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertf128:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -1059,8 +1059,8 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
define <32 x i8> @test_lddqu(i8* %a0) {
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
@@ -1084,10 +1084,10 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
; SANDY-LABEL: test_maskmovpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd:
; HASWELL: # BB#0:
@@ -1119,10 +1119,10 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
; SANDY-LABEL: test_maskmovpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd_ymm:
; HASWELL: # BB#0:
@@ -1154,10 +1154,10 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
; SANDY-LABEL: test_maskmovps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps:
; HASWELL: # BB#0:
@@ -1189,10 +1189,10 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
; SANDY-LABEL: test_maskmovps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps_ymm:
; HASWELL: # BB#0:
@@ -1225,8 +1225,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
@@ -1256,8 +1256,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
@@ -1288,7 +1288,7 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
@@ -1319,7 +1319,7 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY: # BB#0:
; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
@@ -1348,10 +1348,10 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
@@ -1363,14 +1363,14 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movapd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <4 x double>, <4 x double> *%a0, align 32
@@ -1382,10 +1382,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
@@ -1397,14 +1397,14 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movaps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <8 x float>, <8 x float> *%a0, align 32
@@ -1417,9 +1417,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
@@ -1432,14 +1432,14 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movddup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -1451,9 +1451,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
define i32 @test_movmskpd(<4 x double> %a0) {
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
@@ -1479,9 +1479,9 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_movmskps(<8 x float> %a0) {
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
@@ -1508,8 +1508,8 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
@@ -1519,13 +1519,13 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <4 x double> %a0, %a0
@@ -1537,8 +1537,8 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
@@ -1548,13 +1548,13 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <8 x float> %a0, %a0
@@ -1566,9 +1566,9 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
@@ -1581,14 +1581,14 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movshdup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1601,9 +1601,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
@@ -1616,14 +1616,14 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsldup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1635,12 +1635,12 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
@@ -1652,14 +1652,14 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movupd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <4 x double>, <4 x double> *%a0, align 1
@@ -1671,12 +1671,12 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
@@ -1688,14 +1688,14 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movups:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <8 x float>, <8 x float> *%a0, align 1
@@ -1708,8 +1708,8 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
@@ -1719,14 +1719,14 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
+; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
+; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fmul <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -1738,8 +1738,8 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
@@ -1749,14 +1749,14 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fmul <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -1767,10 +1767,10 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: orpd:
; HASWELL: # BB#0:
@@ -1783,14 +1783,14 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
; BTVER2: # BB#0:
; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: orpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -1806,10 +1806,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
@@ -1822,14 +1822,14 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
; BTVER2: # BB#0:
; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -1846,9 +1846,9 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_permilpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd:
; HASWELL: # BB#0:
@@ -1880,10 +1880,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_permilpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [8:1.00]
; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd_ymm:
; HASWELL: # BB#0:
@@ -1896,14 +1896,14 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilpd_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -1916,9 +1916,9 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_permilps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps:
; HASWELL: # BB#0:
@@ -1950,10 +1950,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_permilps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [8:1.00]
; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps_ymm:
; HASWELL: # BB#0:
@@ -1966,14 +1966,14 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilps_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1986,8 +1986,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
; SANDY-LABEL: test_permilvarpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd:
; HASWELL: # BB#0:
@@ -2018,7 +2018,7 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd_ymm:
; HASWELL: # BB#0:
@@ -2048,8 +2048,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
; SANDY-LABEL: test_permilvarps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps:
; HASWELL: # BB#0:
@@ -2080,7 +2080,7 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps_ymm:
; HASWELL: # BB#0:
@@ -2112,7 +2112,7 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
@@ -2123,16 +2123,16 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2148,7 +2148,7 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
@@ -2161,14 +2161,14 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2184,7 +2184,7 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
@@ -2197,14 +2197,14 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2217,10 +2217,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:3.00]
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
@@ -2231,16 +2231,16 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2254,9 +2254,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
@@ -2269,14 +2269,14 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; BTVER2: # BB#0:
; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2289,8 +2289,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
@@ -2318,10 +2318,10 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:3.00]
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:3.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
@@ -2332,16 +2332,16 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2354,10 +2354,10 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:3.00]
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
@@ -2368,16 +2368,16 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2391,8 +2391,8 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
@@ -2402,14 +2402,14 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2421,8 +2421,8 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
@@ -2432,14 +2432,14 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -2451,11 +2451,11 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_testpd:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd:
; HASWELL: # BB#0:
@@ -2495,12 +2495,12 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
; SANDY-LABEL: test_testpd_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd_ymm:
; HASWELL: # BB#0:
@@ -2542,11 +2542,11 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_testps:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps:
; HASWELL: # BB#0:
@@ -2586,12 +2586,12 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
; SANDY-LABEL: test_testps_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps_ymm:
; HASWELL: # BB#0:
@@ -2635,7 +2635,7 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
@@ -2648,14 +2648,14 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2669,7 +2669,7 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
@@ -2698,9 +2698,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
@@ -2713,14 +2713,14 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2733,8 +2733,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
@@ -2762,10 +2762,10 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
@@ -2778,14 +2778,14 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -2801,10 +2801,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
@@ -2817,14 +2817,14 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -2841,7 +2841,7 @@ define void @test_zeroall() {
; SANDY-LABEL: test_zeroall:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroall:
; HASWELL: # BB#0:
@@ -2866,7 +2866,7 @@ define void @test_zeroupper() {
; SANDY-LABEL: test_zeroupper:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroupper:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/avx-unpack.ll b/test/CodeGen/X86/avx-unpack.ll
index 6924d98b38b17..7826bc97eec57 100644
--- a/test/CodeGen/X86/avx-unpack.ll
+++ b/test/CodeGen/X86/avx-unpack.ll
@@ -1,57 +1,84 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK: vunpckhps
define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpcklps %ymm
-define <8 x float> @unpacklops-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpacklops_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklops_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpcklpd %ymm
-define <4 x double> @unpacklopd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpacklopd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklopd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpckhps %ymm
-define <8 x float> @unpackhips-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpackhips_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhips_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,3,u,4,u,5]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,u,3,u,4,u,5,u]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpckhpd %ymm
-define <4 x double> @unpackhipd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpackhipd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhipd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
@@ -60,102 +87,135 @@ entry:
;;;; Unpack versions using the fp unit for int unpacking
;;;;
-; CHECK: vunpckhps
define <8 x i32> @unpackhips1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhps (%
define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpckhpd (%
define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklps (%
define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklpd (%
define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vpunpckhwd
-; CHECK: vpunpckhwd
-; CHECK: vinsertf128
define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpcklwd
-; CHECK: vpunpcklwd
-; CHECK: vinsertf128
define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpckhbw
-; CHECK: vpunpckhbw
-; CHECK: vinsertf128
define <32 x i8> @unpackhbw_undef(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle.i
}
-; CHECK: vpunpcklbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
define <32 x i8> @unpacklbw_undef(<32 x i8> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle.i
}
+
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index 38389de7a8a10..b7a4d5b5c308e 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,30 +1,37 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK-LABEL: A:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: A:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-; CHECK-LABEL: B:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: B:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 1>
ret <4 x double> %shuffle
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-; Just check that no crash happens
-; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
+; CHECK-LABEL: insert_crash:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+; CHECK-NEXT: retq
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%ret_0a.i.i.i452 = shufflevector <4 x double> %v1.i.i451, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -40,72 +47,87 @@ allocas:
;; DAG Combine must remove useless vinsertf128 instructions
-; CHECK-LABEL: DAGCombineA:
-; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
- %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %2
+; CHECK-LABEL: DAGCombineA:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %t2 = shufflevector <8 x i32> %t1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %t2
}
-; CHECK-LABEL: DAGCombineB:
-; CHECK: vpaddd %xmm
-; CHECK-NOT: vinsertf128 $1
-; CHECK: vpaddd %xmm
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
- %1 = add <8 x i32> %v1, %v2
- %2 = add <8 x i32> %1, %v1
- ret <8 x i32> %2
+; CHECK-LABEL: DAGCombineB:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t1 = add <8 x i32> %v1, %v2
+ %t2 = add <8 x i32> %t1, %v1
+ ret <8 x i32> %t2
}
-; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_si:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
; rdar://10643481
-; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovaps
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_combine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 16
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
- ret <8 x float> %2
+ %t0 = bitcast float* %add.ptr to <4 x float>*
+ %t1 = load <4 x float>, <4 x float>* %t0, align 16
+ %t2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %t1, i8 1)
+ ret <8 x float> %t2
}
; rdar://11076953
-; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovups
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_ucombine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 8
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
- ret <8 x float> %2
+ %t0 = bitcast float* %add.ptr to <4 x float>*
+ %t1 = load <4 x float>, <4 x float>* %t0, align 8
+ %t2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %t1, i8 1)
+ ret <8 x float> %t2
}
+
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 971d03af3778a..318c9cfd8a3fc 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -633,13 +633,13 @@ entry:
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V111:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vpbroadcastd LCPI29_0, %ymm1
+; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V111:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -660,13 +660,13 @@ entry:
define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V113:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vbroadcastss LCPI30_0, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V113:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -687,12 +687,12 @@ entry:
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32: ## BB#0:
-; X32-NEXT: vbroadcastss LCPI31_0, %xmm0
+; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## BB#0:
-; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-NEXT: retq
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index eae7b94f5135c..b5a13404a2304 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -14,6 +14,7 @@ define double @test1(double %a, double %b) nounwind {
; ALL-NEXT: LBB0_2: ## %l2
; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -36,6 +37,7 @@ define float @test2(float %a, float %b) nounwind {
; ALL-NEXT: LBB1_2: ## %l2
; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 29a5325a0ae98..f858e7eb792fe 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -12,6 +12,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test1:
; SKX: ## BB#0:
@@ -21,6 +22,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SKX-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -36,6 +38,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
@@ -45,6 +48,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -58,6 +62,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test3:
; SKX: ## BB#0:
@@ -65,6 +70,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -78,6 +84,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
@@ -86,6 +93,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@@ -96,11 +104,13 @@ define i32 @test5(<4 x float> %x) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, %eax
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test5:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, %eax
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
@@ -111,11 +121,13 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test6:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
@@ -135,6 +147,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test7:
; SKX: ## BB#0:
@@ -150,6 +163,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
@@ -168,6 +182,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
@@ -183,6 +198,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
@@ -201,6 +217,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
@@ -216,6 +233,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
@@ -234,6 +252,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
@@ -249,6 +268,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
@@ -1293,7 +1313,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -1457,7 +1477,7 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL: ## BB#0:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -2326,7 +2346,7 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 2b04b9229b3d2..b3fbceea80a94 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -8,6 +8,7 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -19,6 +20,7 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -30,6 +32,7 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -42,6 +45,7 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -53,6 +57,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -64,6 +69,7 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -117,12 +123,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -137,12 +145,14 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -154,6 +164,7 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -168,6 +179,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
; SKX: ## BB#0:
@@ -178,6 +190,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -330,6 +343,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
@@ -339,6 +353,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -642,6 +657,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
@@ -651,6 +667,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -704,6 +721,7 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -715,6 +733,7 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -727,6 +746,7 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -739,6 +759,7 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -752,6 +773,7 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -766,6 +788,7 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -780,6 +803,7 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -795,6 +819,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -809,6 +834,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -823,6 +849,7 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -838,6 +865,7 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -855,6 +883,7 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -920,12 +949,14 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -938,12 +969,14 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -957,12 +990,14 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -976,6 +1011,7 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -988,12 +1024,14 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1010,12 +1048,14 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -1029,6 +1069,7 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1041,6 +1082,7 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1058,12 +1100,14 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
@@ -1081,12 +1125,14 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1104,6 +1150,7 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
@@ -1124,12 +1171,14 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
@@ -1147,12 +1196,14 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
@@ -1172,6 +1223,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test43:
; SKX: ## BB#0:
@@ -1180,6 +1232,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
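The bulk of the hunks in these two files merely append "## -- End function" after the final retq: the assembly printer in this snapshot now emits an end-of-function marker, and the autogenerated assertions were refreshed accordingly, presumably with utils/update_llc_test_checks.py. The resulting check shape in miniature (hypothetical file; the marker text is copied verbatim from the hunks above):

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
define <4 x float> @end_marker(<4 x float> %a) nounwind {
; The argument is already in %xmm0, so the body is a bare return followed by
; the printer's end-of-function comment.
; CHECK-LABEL: end_marker:
; CHECK: ## BB#0:
; CHECK-NEXT: retq
; CHECK-NEXT: ## -- End function
  ret <4 x float> %a
}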
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index e0acf2be653e2..43b1f53a09fae 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -1,56 +1,98 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: test256_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_1:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_1:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
ret <4 x i64> %max
}
define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
-; CHECK-LABEL: test256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_2:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_2:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
-; CHECK-LABEL: test256_3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_3:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k1
+; VLX-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_3:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; NoVLX-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
ret <8 x i32> %max
}
define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
-; CHECK-LABEL: test256_4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_4:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_4:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_5:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_5:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
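The rewritten RUN lines drop the Darwin triple (hence "#" rather than "##" as the comment leader in the checks) and now run the file under two CPUs, sharing a common CHECK prefix plus a per-run VLX/NoVLX prefix so each function records both codegen expectations. The same pattern in miniature (hypothetical file; assertions abridged from the test256_1 hunk above):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define <4 x i64> @eq_demo(<4 x i64> %x, <4 x i64> %y) nounwind {
; With AVX512VL the compare targets a mask register; without it the ymm
; compare result feeds an ordinary variable blend.
; VLX: vpcmpeqq %ymm1, %ymm0, %k1
; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; NoVLX: vpcmpeqq %ymm1, %ymm0, %ymm2
; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
  %m = icmp eq <4 x i64> %x, %y
  %r = select <4 x i1> %m, <4 x i64> %x, <4 x i64> %y
  ret <4 x i64> %r
}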
@@ -58,11 +100,21 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin
}
define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_5b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_5b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_5b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqd %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
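The "# kill:" lines recurring in the NoVLX output are liveness annotations, not instructions: KNL lacks the VL extension, so a 256-bit compare cannot write a mask register directly; the backend instead reinterprets the ymm inputs as the low halves of zmm registers, runs the compare and blend at 512 bits, and narrows the result back the same way. The vmovdqu into %ymm2 exists for the same reason, since the 256-bit memory operand cannot fold into a 512-bit compare. Abridged from the test256_5 hunk above into a self-contained sketch (hypothetical file):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s
define <8 x i32> @widen_demo(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; CHECK: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; CHECK: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK: vmovdqu (%rdi), %ymm2
; CHECK: vpcmpeqd %zmm2, %zmm0, %k1
; CHECK: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
  %y = load <8 x i32>, <8 x i32>* %yp, align 4
  %mask = icmp eq <8 x i32> %x, %y
  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
  ret <8 x i32> %max
}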
@@ -70,11 +122,21 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_6:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_6:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -82,11 +144,21 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_6b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_6b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_6b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -94,11 +166,21 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_7:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_7:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -106,11 +188,21 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_7b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_7b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_7b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -118,11 +210,21 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_8:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_8:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -130,11 +232,21 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_8b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_8b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_8b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -142,12 +254,25 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_9:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_9:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_9:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
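When two compares feed one select, the VLX path folds the conjunction into the second compare's write-mask (%k1 {%k1}), whereas NoVLX materializes both masks and combines them explicitly, with kandw after widening, or vpand/vpandn on the ymm fallback paths. A sketch of the IR shape that produces this, mirroring test256_9 (hypothetical file; the zeroinitializer select is the canonical IR form of a mask AND):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=VLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=NoVLX
define <8 x i32> @and_masks(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
; The second VLX compare is itself masked by the first, so no separate AND
; is needed; NoVLX computes both mask registers and ANDs them.
; VLX: vpcmpeqd %ymm1, %ymm0, %k1
; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
; NoVLX: kandw %k0, %k1, %k1
  %mask1 = icmp eq <8 x i32> %x1, %y1
  %mask0 = icmp eq <8 x i32> %x, %y
  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  ret <8 x i32> %max
}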
@@ -156,12 +281,22 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
}
define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_10:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_10:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_10:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -170,12 +305,20 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
}
define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_11:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_11:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_11:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3
+; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <4 x i64> %x, %y
@@ -185,12 +328,25 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
}
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_12:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_12:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask0 = icmp ule <8 x i32> %x, %y
@@ -200,11 +356,18 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
}
define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
-; CHECK-LABEL: test256_13:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_13:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_13:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
%y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -214,11 +377,21 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
}
define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
-; CHECK-LABEL: test256_14:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_14:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_14:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
%y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -228,12 +401,25 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
}
define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_15:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_15:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_15:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -245,12 +431,21 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
}
define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_16:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_16:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm3
+; NoVLX-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -262,11 +457,21 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
}
define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_17:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_17:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_17:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpneqd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -274,11 +479,21 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_18:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_18:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_18:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpneqd %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -286,11 +501,21 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_19:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_19:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_19:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -298,11 +523,21 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_20:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_20:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_20:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -310,55 +545,90 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
-; CHECK-LABEL: test128_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_1:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_1:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
ret <2 x i64> %max
}
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
-; CHECK-LABEL: test128_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_2:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_2:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
-; CHECK-LABEL: test128_3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_3:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k1
+; VLX-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_3:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
ret <4 x i32> %max
}
define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
-; CHECK-LABEL: test128_4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_4:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_4:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
-; CHECK-LABEL: test128_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_5:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_5:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -366,11 +636,17 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin
}
define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
-; CHECK-LABEL: test128_5b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_5b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_5b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -378,11 +654,17 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi
}
define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_6:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_6:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -390,11 +672,17 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_6b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_6b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_6b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp slt <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -402,11 +690,19 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_7:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_7:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -414,11 +710,19 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_7b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_7b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_7b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
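SSE/AVX integer compares only come in eq and signed-gt flavors, so the NoVLX path above derives sle as NOT(sgt): vpcmpeqd of a register with itself materializes all-ones, and the vpxor inverts the sgt mask. In isolation (hypothetical file; registers left unpinned and the instruction order taken from the test128_7 hunk, though allocation may vary):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s
define <4 x i32> @sle_demo(<4 x i32> %x, <4 x i32> %y) nounwind {
; x sle y == not (x sgt y): compare, build all-ones, invert.
; CHECK: vpcmpgtd
; CHECK: vpcmpeqd
; CHECK: vpxor
  %c = icmp sle <4 x i32> %x, %y
  %r = sext <4 x i1> %c to <4 x i32>
  ret <4 x i32> %r
}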
@@ -426,11 +730,18 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_8:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_8:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -438,11 +749,19 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_8b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_8b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_8b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
+; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
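No complement trick exists for the unsigned orderings, so NoVLX leans on the SSE4.1 min/max identities instead: x ule y holds exactly when umin(x, y) == x, giving the vpminud + vpcmpeqd pair above (and the vpmaxud variant for uge in test128_8b). In isolation (hypothetical file; abridged from the test128_8 hunk):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s
define <4 x i32> @ule_demo(<4 x i32> %x, <4 x i32> %y) nounwind {
; umin(x, y) == x is equivalent to x ule y.
; CHECK: vpminud
; CHECK: vpcmpeqd
  %c = icmp ule <4 x i32> %x, %y
  %r = sext <4 x i1> %c to <4 x i32>
  ret <4 x i32> %r
}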
@@ -450,12 +769,20 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_9:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_9:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; VLX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_9:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -464,12 +791,22 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
}
define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_10:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_10:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_10:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; NoVLX-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
%mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
@@ -478,12 +815,20 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
}
define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_11:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_11:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_11:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <2 x i64> %x, %y
@@ -493,12 +838,21 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
}
define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_12:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_12:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask0 = icmp ule <4 x i32> %x, %y
@@ -508,11 +862,18 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
}
define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
-; CHECK-LABEL: test128_13:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_13:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_13:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
%y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
@@ -522,11 +883,20 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
}
define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
-; CHECK-LABEL: test128_14:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_14:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_14:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
%y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -536,12 +906,21 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
}
define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_15:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_15:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_15:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm3
+; NoVLX-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -553,12 +932,21 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
}
define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_16:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_16:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
@@ -570,11 +958,19 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64
}
define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_17:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_17:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_17:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -582,11 +978,19 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_18:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_18:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_18:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -594,11 +998,18 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_19:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_19:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_19:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpmaxud (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -606,11 +1017,19 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_20:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_20:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_20:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
+; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index f297fc3db95fa..4d3a1495617ea 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -1,13 +1,124 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi0:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi3:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi4:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi5:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi6:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi7:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -18,11 +129,122 @@ entry:
}
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi8:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi9:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi10:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi11:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi12:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi13:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi14:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi15:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -34,12 +256,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi16:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi17:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi18:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi19:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi20:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi21:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi22:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi23:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -52,12 +386,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi24:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi25:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi26:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi27:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi28:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi29:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi30:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi31:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -72,11 +518,127 @@ entry:
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi32:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi33:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi34:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi35:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi36:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi37:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi38:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi39:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -87,11 +649,127 @@ entry:
}
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi40:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi41:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi42:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi43:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi44:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi45:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi46:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi47:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -103,12 +781,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi48:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi49:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi50:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi51:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi52:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi53:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi54:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi55:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -121,12 +916,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi56:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi57:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi58:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi59:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi60:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi61:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi62:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi63:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -141,12 +1053,46 @@ entry:
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi64:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi65:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi66:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -157,12 +1103,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi67:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi68:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi69:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -174,13 +1154,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi70:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi71:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi72:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -193,13 +1216,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi73:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi74:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi75:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -214,11 +1280,24 @@ entry:
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -229,11 +1308,24 @@ entry:
}
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -245,12 +1337,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -263,12 +1369,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -283,11 +1403,72 @@ entry:
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi76:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi77:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi78:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -298,11 +1479,72 @@ entry:
}
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi79:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi80:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi81:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -314,12 +1556,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi82:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi83:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi84:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -332,12 +1636,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi85:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi86:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi87:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -352,11 +1718,77 @@ entry:
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi88:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi89:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi90:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -367,11 +1799,77 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi91:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi92:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi93:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -383,12 +1881,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi94:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi95:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi96:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -401,12 +1966,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi97:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi98:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi99:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -421,12 +2053,123 @@ entry:
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi100:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi101:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi102:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi103:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi104:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi105:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi106:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi107:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -437,12 +2180,123 @@ entry:
}
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi108:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi109:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi110:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi111:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi112:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi113:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi114:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi115:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -454,13 +2308,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi116:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi117:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi118:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi119:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi120:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi121:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi122:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi123:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -473,13 +2439,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi124:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi125:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi126:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi127:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi128:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi129:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi130:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi131:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -494,12 +2572,128 @@ entry:
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi132:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi133:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi134:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi135:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi136:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi137:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi138:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi139:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -510,12 +2704,128 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi140:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi141:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi142:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi143:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi144:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi145:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi146:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi147:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -527,13 +2837,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi148:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi149:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi150:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi151:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi152:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi153:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi154:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi155:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -546,13 +2973,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi156:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi157:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi158:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi159:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi160:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi161:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi162:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi163:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -567,12 +3111,348 @@ entry:
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi164:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi165:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi166:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -583,12 +3463,263 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi167:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi168:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi169:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -600,13 +3731,358 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi170:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi171:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi172:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -619,13 +4095,273 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi173:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi174:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi175:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -640,11 +4376,51 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -655,11 +4431,51 @@ entry:
}
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -671,12 +4487,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -690,12 +4564,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -711,11 +4643,52 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -728,12 +4701,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -750,11 +4782,50 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -765,11 +4836,50 @@ entry:
}
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -781,12 +4891,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -800,12 +4967,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -821,11 +5045,51 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -838,12 +5102,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -860,11 +5182,39 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi176:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi177:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi178:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -875,11 +5225,39 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi179:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi180:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi181:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -891,12 +5269,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi182:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi183:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi184:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -910,12 +5334,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi185:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi186:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi187:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -931,11 +5401,40 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi188:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi189:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi190:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -948,12 +5447,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi191:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi192:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi193:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -970,11 +5516,46 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi194:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi195:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi196:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -985,11 +5566,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi197:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi198:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi199:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1001,12 +5617,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi200:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi201:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi202:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -1020,12 +5689,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi203:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi204:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi205:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1041,11 +5763,47 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi206:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi207:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi208:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -1058,12 +5816,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi209:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi210:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi211:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -1080,21 +5892,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1106,21 +5920,23 @@ entry:
}
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1133,23 +5949,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1163,23 +5981,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1195,21 +6015,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1223,23 +6045,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1256,12 +6080,72 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi212:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi213:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi214:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
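The v32i1 widenings above no longer fit a 16-bit k-register move, so the NoVLX lowering goes through the stack: the frame is realigned to 32 bytes, a zeroed mask word (from vpxord/vptestmd) is stored for the upper half, the live mask bits are extracted one at a time with kshiftlw/kshiftrw pairs, reassembled through vpinsrb, retested into a k-register, stored at (%rsp), and reloaded as one 32-bit value. The IR being lowered is the same shuffle-and-bitcast shape, just one step wider; a hypothetical reduced sketch:

; Sketch: zero-extend an 8-bit compare mask into an i32 result.
define zeroext i32 @sketch_v8i1_v32i1(<8 x i32> %a, <8 x i32> %b) {
entry:
  %cmp = icmp eq <8 x i32> %a, %b
  ; Lanes 8..31 all select a zero lane of the second operand.
  %wide = shufflevector <8 x i1> %cmp, <8 x i1> zeroinitializer,
          <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8,
                      i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8,
                      i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %res = bitcast <32 x i1> %wide to i32
  ret i32 %res
}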
@@ -1272,12 +6156,72 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi215:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi216:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi217:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1289,13 +6233,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi218:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi219:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi220:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1308,13 +6314,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi221:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi222:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi223:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1329,12 +6397,72 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1347,13 +6475,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi227:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi228:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi229:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1369,12 +6559,77 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi230:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi231:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi232:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1385,12 +6640,77 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi233:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi234:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi235:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1402,13 +6722,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi236:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi237:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi238:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
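In the masked v64i1 variants the i8 mask argument arrives in %edi, is moved into a k-register with kmovw, and is ANDed with the compare result (the kandw above) before the per-bit extraction; the 64-bit return value is then assembled from two 32-bit stack words with shlq $32 and orq. A hypothetical reduced sketch of the masking step in IR (the widening to <64 x i1> follows the same shuffle/bitcast pattern shown earlier):

; Sketch: apply a scalar i8 mask to an 8-lane compare result.
define zeroext i8 @sketch_masked_v8i1(i8 zeroext %u, <8 x i32> %a, <8 x i32> %b) {
entry:
  %cmp = icmp eq <8 x i32> %a, %b
  %mask = bitcast i8 %u to <8 x i1>
  ; Each result bit survives only where the corresponding mask bit is set.
  %and = and <8 x i1> %cmp, %mask
  %res = bitcast <8 x i1> %and to i8
  ret i8 %res
}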
@@ -1421,13 +6808,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi239:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi240:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi241:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1442,12 +6896,77 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi242:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi243:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi244:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1460,13 +6979,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi245:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi246:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi247:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1482,12 +7068,120 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi251:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi252:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi253:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi254:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi255:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1498,12 +7192,120 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi256:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi257:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi258:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi259:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi260:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi261:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi262:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi263:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
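For the 16-lane compares the NoVLX path must pull all sixteen mask bits out of the k-register into general-purpose registers before reassembling them with vpinsrb, which is why these bodies additionally push %rbx and %r12 through %r15 and emit matching .cfi_offset directives. A hypothetical reduced sketch of the v16i1-to-v32i1 IR being lowered:

; Sketch: zero-extend a 16-bit compare mask into an i32 result.
define zeroext i32 @sketch_v16i1_v32i1(<16 x i32> %a, <16 x i32> %b) {
entry:
  %cmp = icmp eq <16 x i32> %a, %b
  ; Lanes 16..31 select the zero half of the concatenated operands.
  %wide = shufflevector <16 x i1> %cmp, <16 x i1> zeroinitializer,
          <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                      i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
                      i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = bitcast <32 x i1> %wide to i32
  ret i32 %res
}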
@@ -1515,13 +7317,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi264:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi265:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi266:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi267:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi268:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi269:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi270:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi271:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1534,13 +7445,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi272:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi273:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi274:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi275:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi276:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi277:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi278:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi279:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1555,12 +7575,120 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi280:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi281:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi282:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi283:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi284:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi285:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi286:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi287:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1573,13 +7701,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi288:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi289:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi290:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi291:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi292:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi293:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi294:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi295:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1595,12 +7832,125 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi296:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi297:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi298:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi299:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi300:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi301:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi302:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi303:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1611,12 +7961,125 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi304:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi305:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi306:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi307:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi308:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi309:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi310:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi311:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1628,13 +8091,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi312:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi313:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi314:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi315:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi316:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi317:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi318:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi319:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1647,13 +8224,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi320:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi321:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi322:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi323:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi324:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi325:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi326:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi327:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1668,12 +8359,125 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi328:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi329:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi330:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi331:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi332:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi333:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi334:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi335:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1686,13 +8490,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi336:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi337:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi338:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi339:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi340:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi341:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi342:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi343:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1708,12 +8626,23 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1724,12 +8653,23 @@ entry:
}
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1741,13 +8681,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1761,13 +8722,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1783,12 +8765,24 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1801,13 +8795,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1824,11 +8840,35 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1839,11 +8879,35 @@ entry:
}
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1855,12 +8919,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1874,12 +8972,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1895,11 +9027,36 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1912,12 +9069,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1934,11 +9126,34 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1949,11 +9164,34 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1965,12 +9203,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1984,12 +9255,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2005,11 +9309,35 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2022,12 +9350,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2044,11 +9406,39 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi344:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi345:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi346:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2059,11 +9449,39 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi347:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi348:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi349:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2075,12 +9493,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi350:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi351:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi352:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2094,12 +9550,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi353:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi354:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi355:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2115,11 +9609,40 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi356:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi357:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi358:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2132,12 +9655,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi359:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi360:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi361:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2154,11 +9716,46 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi362:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi363:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi364:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2169,11 +9766,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi365:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi366:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi367:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2185,12 +9817,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi368:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi369:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi370:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2204,12 +9881,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi371:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi372:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi373:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2225,11 +9947,47 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi374:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi375:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi376:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2242,12 +10000,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi377:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi378:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi379:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2264,12 +10068,53 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2280,12 +10125,53 @@ entry:
}
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2297,13 +10183,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2317,13 +10262,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2339,12 +10343,54 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2357,13 +10403,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2380,12 +10486,52 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2396,12 +10542,52 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2413,13 +10599,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2433,13 +10677,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2455,12 +10757,53 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2473,13 +10816,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2496,12 +10898,41 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi380:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi381:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi382:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2512,12 +10943,41 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi383:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi384:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi385:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2529,13 +10989,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi386:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi387:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi388:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2549,13 +11056,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi389:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi390:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi391:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2571,12 +11125,42 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi392:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi393:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi394:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2589,13 +11173,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi395:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi396:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi397:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2612,12 +11244,48 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi398:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi399:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi400:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2628,12 +11296,48 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi401:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi402:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi403:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2645,13 +11349,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi404:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi405:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi406:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2665,13 +11423,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi407:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi408:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi409:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2687,12 +11499,49 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi410:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi411:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi412:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2705,13 +11554,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi413:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi414:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi415:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2728,12 +11632,20 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2744,12 +11656,20 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2761,13 +11681,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2780,13 +11709,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2801,12 +11739,20 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2819,13 +11765,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2841,12 +11796,70 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi416:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi417:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi418:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2857,12 +11870,70 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi419:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi420:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi421:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2874,13 +11945,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi422:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi423:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi424:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2893,13 +12023,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi425:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi426:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi427:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2914,12 +12103,70 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi428:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi429:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi430:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2932,13 +12179,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi431:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi432:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi433:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2954,12 +12260,75 @@ entry:
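; The i64-returning variants that follow reserve a 64-byte stack area
; instead of 32: three zero mask words are stored above the live word, and
; the final value is assembled from two 32-bit stack loads combined with
; shlq $32 / orq.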
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi434:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi435:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi436:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2970,12 +12339,75 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi437:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi438:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi439:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2987,13 +12419,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi440:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi441:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi442:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -3006,13 +12502,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi443:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi444:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi445:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3027,12 +12587,75 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi446:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi447:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi448:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -3045,13 +12668,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi449:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi450:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi451:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -3067,11 +12754,122 @@ entry:
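; For the 16 x i8 sources below, the NoVLX configuration evidently has no
; byte-granularity compare into a mask register, so the compare stays in the
; XMM domain (vpcmpgtb) and is widened with vpmovsxbd before vptestmd.
; Extracting all sixteen mask bits at once also requires the callee-saved
; GPRs (rbx, r12-r15), hence the longer prologue/epilogue in these checks.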
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi452:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi453:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi454:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi455:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi456:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi457:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi458:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi459:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3082,11 +12880,122 @@ entry:
}
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi460:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi461:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi462:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi463:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi464:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi465:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi466:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi467:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3098,12 +13007,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi468:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi469:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi470:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi471:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi472:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi473:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi474:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi475:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3116,12 +13137,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi476:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi477:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi478:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi479:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi480:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi481:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi482:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi483:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3136,11 +13269,127 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi484:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi485:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi486:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi487:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi488:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi489:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi490:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi491:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3151,11 +13400,127 @@ entry:
}
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi492:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi493:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi494:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi495:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi496:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi497:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi498:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi499:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3167,12 +13532,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi500:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi501:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi502:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi503:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi504:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi505:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi506:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi507:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3185,12 +13667,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi508:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi509:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi510:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi511:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi512:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi513:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi514:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi515:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3205,12 +13804,46 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi516:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi517:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi518:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3221,12 +13854,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi519:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi520:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi521:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3238,13 +13905,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi522:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi523:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi524:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3257,13 +13967,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi525:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi526:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi527:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3278,11 +14031,24 @@ entry:
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3293,11 +14059,24 @@ entry:
}
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3309,12 +14088,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3327,12 +14120,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3347,11 +14154,72 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi528:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi529:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi530:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3362,11 +14230,72 @@ entry:
}
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi531:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi532:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi533:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3378,12 +14307,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi534:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi535:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi536:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3396,12 +14387,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi537:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi538:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi539:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3416,11 +14469,77 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi540:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi541:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi542:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3431,11 +14550,77 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi543:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi544:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi545:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3447,12 +14632,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi546:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi547:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi548:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3465,12 +14717,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi549:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi550:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi551:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3485,12 +14804,123 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi552:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi553:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi554:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi555:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi556:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi557:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi558:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi559:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3501,12 +14931,123 @@ entry:
}
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi560:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi561:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi562:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi563:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi564:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi565:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi566:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi567:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3518,13 +15059,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi568:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi569:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi570:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi571:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi572:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi573:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi574:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi575:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3537,13 +15190,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi576:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi577:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi578:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi579:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi580:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi581:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi582:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi583:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3558,12 +15323,128 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi584:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi585:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi586:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi587:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi588:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi589:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi590:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi591:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3574,12 +15455,128 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi592:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi593:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi594:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi595:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi596:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi597:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi598:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi599:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3591,13 +15588,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi600:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi601:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi602:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi603:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi604:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi605:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi606:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi607:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3610,13 +15724,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi608:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi609:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi610:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi611:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi612:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi613:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi614:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi615:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3631,12 +15862,348 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi616:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi617:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi618:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3647,12 +16214,263 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi619:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi620:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi621:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3664,13 +16482,358 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi622:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi623:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi624:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3683,13 +16846,273 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi625:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi626:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi627:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3704,11 +17127,51 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3719,11 +17182,51 @@ entry:
}
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3735,12 +17238,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3754,12 +17315,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3775,11 +17394,52 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3792,12 +17452,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3814,11 +17533,50 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3829,11 +17587,50 @@ entry:
}
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3845,12 +17642,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3864,12 +17718,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3885,11 +17796,51 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3902,12 +17853,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3924,11 +17933,39 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi628:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi629:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi630:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3939,11 +17976,39 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi631:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi632:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi633:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3955,12 +18020,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi634:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi635:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi636:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3974,12 +18085,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi637:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi638:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi639:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3995,11 +18152,40 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi640:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi641:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi642:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4012,12 +18198,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi643:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi644:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi645:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4034,11 +18267,46 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi646:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi647:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi648:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4049,11 +18317,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi649:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi650:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi651:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4065,12 +18368,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi652:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi653:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi654:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4084,12 +18440,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi655:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi656:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi657:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4105,11 +18514,47 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi658:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi659:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi660:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4122,12 +18567,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi661:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi662:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi663:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4144,21 +18643,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4170,21 +18671,23 @@ entry:
}
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4197,23 +18700,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4227,23 +18732,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4259,21 +18766,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4287,23 +18796,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4320,12 +18831,72 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi664:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi665:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi666:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4336,12 +18907,72 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi667:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi668:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi669:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4353,13 +18984,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi670:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi671:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi672:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4372,13 +19065,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi673:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi674:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi675:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4393,12 +19148,72 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi676:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi677:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi678:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4411,13 +19226,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi679:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi680:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi681:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4433,12 +19310,77 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi682:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi683:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi684:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4449,12 +19391,77 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi685:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi686:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi687:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4466,13 +19473,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi688:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi689:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi690:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4485,13 +19559,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi691:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi692:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi693:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4506,12 +19647,77 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi694:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi695:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi696:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4524,13 +19730,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi697:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi698:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi699:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4546,12 +19819,120 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi700:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi701:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi702:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi703:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi704:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi705:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi706:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi707:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4562,12 +19943,120 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi708:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi709:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi710:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi711:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi712:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi713:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi714:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi715:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4579,13 +20068,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi716:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi717:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi718:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi719:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi720:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi721:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi722:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi723:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4598,13 +20196,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi724:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi725:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi726:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi727:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi728:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi729:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi730:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi731:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4619,12 +20326,120 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi732:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi733:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi734:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi735:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi736:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi737:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi738:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi739:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4637,13 +20452,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi740:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi741:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi742:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi743:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi744:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi745:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi746:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi747:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4659,12 +20583,125 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi748:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi749:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi750:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi751:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi752:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi753:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi754:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi755:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4675,12 +20712,125 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi756:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi757:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi758:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi759:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi760:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi761:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi762:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi763:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4692,13 +20842,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi764:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi765:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi766:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi767:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi768:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi769:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi770:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi771:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4711,13 +20975,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi772:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi773:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi774:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi775:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi776:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi777:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi778:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi779:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4732,12 +21110,125 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi780:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi781:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi782:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi783:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi784:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi785:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi786:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi787:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4750,13 +21241,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi788:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi789:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi790:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi791:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi792:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi793:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi794:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi795:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4772,12 +21377,23 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4788,12 +21404,23 @@ entry:
}
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4805,13 +21432,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4825,13 +21473,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4847,12 +21516,24 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4865,13 +21546,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4888,11 +21591,35 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4903,11 +21630,35 @@ entry:
}
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4919,12 +21670,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4938,12 +21723,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4959,11 +21778,36 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4976,12 +21820,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4998,11 +21877,34 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5013,11 +21915,34 @@ entry:
}
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5029,12 +21954,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5048,12 +22006,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5069,11 +22060,35 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5086,12 +22101,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5108,11 +22157,39 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi796:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi797:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi798:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5123,11 +22200,39 @@ entry:
}
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi799:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi800:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi801:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5139,12 +22244,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi802:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi803:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi804:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5158,12 +22301,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi805:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi806:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi807:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5179,11 +22360,40 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5196,12 +22406,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5218,11 +22467,46 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi814:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi815:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi816:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5233,11 +22517,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi817:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi818:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi819:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5249,12 +22568,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi820:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi821:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi822:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5268,12 +22632,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi823:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi824:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi825:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5289,11 +22698,47 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi826:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi827:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi828:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5306,12 +22751,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi829:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi830:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi831:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5328,12 +22819,53 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5344,12 +22876,53 @@ entry:
}
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5361,13 +22934,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5381,13 +23013,72 @@ entry:
}
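; Sketch of the masked-variant prologue (reviewer's reading of the checks):
; kmovw loads the i8 mask %__u into %k0, and each "kshiftlw $n" / "kshiftrw
; $15" pair isolates one mask bit ($15 -> bit 0, $14 -> bit 1, $13 -> bit 2,
; $12 -> bit 3). The bits travel through GPRs into a vector via vpinsrb at byte
; offsets 0/4/8/12, and vpand applies them lane-wise to the compare result
; before the same element-insertion sequence as in the unmasked case.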
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5403,12 +23094,54 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5421,13 +23154,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5444,12 +23237,52 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5460,12 +23293,52 @@ entry:
}
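; For the v4i1 -> v16i1 widenings the same insertion pattern switches to dword
; granularity: vpternlogd, vpermi2d, "vpslld $31", and vptestmd replace their
; qword counterparts, the permute indices grow to 16 lanes (e.g.
; [0,16,2,3,...,15], where 16 selects lane 0 of the second table), and lane 0
; is sanitized with "andl $1" rather than a kshift pair. A reviewer's summary
; of the checks below, not a normative description.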
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5477,13 +23350,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5497,13 +23428,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5519,12 +23508,53 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5537,13 +23567,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5560,12 +23649,41 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi832:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi833:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi834:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5576,12 +23694,41 @@ entry:
}
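; The v4i1 -> v32i1 tests switch strategy: the NoVLX path assembles the wide
; mask in memory. The rbp frame plus "andq $-32, %rsp; subq $32, %rsp" carves
; out an aligned 32-byte slot, vpshufb packs the compare result's low bytes
; (lanes 0,4,8,12) and zeroes the rest, vpmovsxbd + "vpslld $31" + vptestmd
; turn each 16-lane half into a k-register, two kmovw stores spill the halves
; into the slot, and a single "movl (%rsp)" reload yields the i32 result.
; (Reviewer's sketch of the sequence above.)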
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5593,13 +23740,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5613,13 +23807,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5635,12 +23876,42 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5653,13 +23924,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi847:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi848:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi849:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5676,12 +23995,48 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi850:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi851:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi852:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5692,12 +24047,48 @@ entry:
}
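; The v4i1 -> v64i1 variants extend the spill scheme to an i64: a zero k-mask
; (vptestmd of a vpxord-zeroed zmm) pads the unused words of a 64-byte slot,
; the live halves are spilled as before, and the two 32-bit loads are fused
; with "shlq $32, %rcx; orq %rcx, %rax" to form the return value. Reviewer's
; reading of the checks above and below.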
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi853:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi854:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi855:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5709,13 +24100,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi856:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi857:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi858:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5729,13 +24174,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi859:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi860:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi861:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5751,12 +24250,49 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi862:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi863:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi864:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5769,13 +24305,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi865:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi866:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi867:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5792,12 +24383,20 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5808,12 +24407,20 @@ entry:
}
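; From here the sources are v8i64 in ZMM registers, which baseline AVX512F
; handles natively, so the NoVLX checks collapse to the same
; vpcmpgtq-into-%k0 form as VLX; the remaining drift is kmovw vs. kmovd and
; the dropped vzeroupper (presumably a KNL-style NoVLX run line, where the
; backend suppresses vzeroupper). Reviewer's note on the contrast with the
; v4i1 cases above.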
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5825,13 +24432,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5844,13 +24460,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5865,12 +24490,20 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5883,13 +24516,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5905,12 +24547,70 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
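+;
+; The v8i1-to-v32i1 widening spills a zeroed k-register next to the result so
+; the unused upper bits of the reloaded word read as zero, re-tests the eight
+; extracted compare bits with vptestmd, and reloads the full 32-bit value with
+; a single movl from (%rsp).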
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5921,12 +24621,70 @@ entry:
}
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi871:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi872:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi873:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5938,13 +24696,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi874:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi875:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi876:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5957,13 +24774,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi877:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi878:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi879:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5978,12 +24854,70 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi880:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi881:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi882:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5996,13 +24930,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi883:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi884:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi885:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6018,12 +25011,75 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi886:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi887:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi888:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
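+;
+; The v64i1 variants pad the 64-byte stack area with three zero-mask spills,
+; then reload the two 32-bit halves and combine them with shlq $32/orq,
+; mirroring the v32i1 sequence above.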
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6034,12 +25090,75 @@ entry:
}
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi889:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi890:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi891:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6051,13 +25170,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6070,13 +25253,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi895:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi896:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi897:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6091,12 +25338,75 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi898:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi899:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi900:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6109,13 +25419,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi901:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi902:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi903:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6131,11 +25505,124 @@ entry:
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi907:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi908:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi909:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi910:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi911:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
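+;
+; Without AVX512VL there is no byte compare into a mask register at XMM width,
+; so signed-ge is computed as NOT(b > a): vpcmpgtb gives the strict compare and
+; vpcmpeqd/vpxor flip it against all-ones. Extracting all sixteen bits needs
+; enough scratch GPRs that rbx and r12-r15 are saved and restored around it.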
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6146,11 +25633,125 @@ entry:
}
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi912:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi913:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi914:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi915:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi916:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi917:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi918:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi919:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6162,12 +25763,126 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi920:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi921:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi922:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi923:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi924:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi925:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi926:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi927:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6180,12 +25895,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi931:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi932:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi933:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi934:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi935:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6200,11 +26030,129 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi936:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi937:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi938:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi939:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi940:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi941:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi942:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi943:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6215,11 +26163,130 @@ entry:
}
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi944:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi945:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi946:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi947:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi948:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi949:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi950:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi951:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6231,12 +26298,131 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi952:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi953:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi954:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi955:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi956:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi957:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi958:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi959:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6249,12 +26435,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi960:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi961:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi962:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi963:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi964:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi965:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi966:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi967:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6269,12 +26575,48 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi968:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi969:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi970:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -6285,12 +26627,49 @@ entry:
}
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi971:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi972:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi973:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6302,13 +26681,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi974:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi975:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi976:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -6321,13 +26745,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi977:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi978:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi979:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6342,11 +26812,26 @@ entry:
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6357,11 +26842,27 @@ entry:
}
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6373,12 +26874,28 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6391,12 +26908,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6411,11 +26945,74 @@ entry:
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi980:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi981:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi982:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6426,11 +27023,75 @@ entry:
}
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi983:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi984:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi985:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6442,12 +27103,76 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi986:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi987:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi988:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6460,12 +27185,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi989:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi990:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi991:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6480,11 +27270,79 @@ entry:
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi992:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi993:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi994:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6495,11 +27353,80 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi995:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi996:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi997:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6511,12 +27438,81 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi998:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi999:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1000:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6529,12 +27525,82 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1001:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1002:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1003:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6549,12 +27615,125 @@ entry:
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1004:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1005:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1006:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1007:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1008:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1009:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1010:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1011:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6565,12 +27744,126 @@ entry:
}
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1012:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1013:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1014:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1015:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1016:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1017:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1018:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1019:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6582,13 +27875,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1020:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1021:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1022:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1023:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1024:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1025:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1026:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1027:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6601,13 +28008,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1028:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1029:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1030:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1031:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1032:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1033:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1034:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1035:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6622,12 +28144,130 @@ entry:
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1036:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1037:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1038:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1039:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1040:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1041:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1042:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1043:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6638,12 +28278,131 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1044:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1045:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1046:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1047:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1048:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1049:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1050:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1051:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6655,13 +28414,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1052:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1053:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1054:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1055:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1056:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1057:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1058:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1059:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6674,13 +28552,133 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1060:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1061:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1062:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1063:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1064:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1065:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1066:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1067:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6695,12 +28693,351 @@ entry:
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1068:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1069:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1070:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -6711,12 +29048,268 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1071:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1072:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1073:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6728,13 +29321,361 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1074:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1075:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1076:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -6747,13 +29688,278 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1077:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1078:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1079:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3
+; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
+; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6768,11 +29974,53 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6783,11 +30031,54 @@ entry:
}
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6799,12 +30090,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6818,12 +30167,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6839,12 +30247,55 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6857,13 +30308,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6880,11 +30390,52 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6895,11 +30446,53 @@ entry:
}
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6911,12 +30504,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6930,12 +30580,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6951,12 +30659,54 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6969,13 +30719,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6992,11 +30800,41 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1080:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1081:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1082:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7007,11 +30845,42 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1083:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1084:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1085:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7023,12 +30892,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1086:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1087:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1088:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7042,12 +30957,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1089:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1090:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1091:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7063,12 +31025,43 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1092:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1093:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1094:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7081,13 +31074,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1095:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1096:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1097:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7104,11 +31144,48 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1098:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1099:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1100:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7119,11 +31196,49 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1101:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1102:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1103:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7135,12 +31250,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1104:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1105:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1106:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7154,12 +31322,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1107:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1108:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1109:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7175,12 +31397,50 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1110:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1111:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1112:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7193,13 +31453,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1113:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1114:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1115:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7216,21 +31530,23 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7242,21 +31558,23 @@ entry:
}
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7269,23 +31587,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7299,23 +31619,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7331,22 +31653,24 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7360,24 +31684,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7394,12 +31720,72 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1116:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1117:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1118:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7410,12 +31796,72 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1119:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1120:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1121:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7427,13 +31873,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1122:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1123:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1124:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7446,13 +31954,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1125:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1126:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1127:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7467,13 +32037,73 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1128:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1129:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1130:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7486,14 +32116,76 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1131:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1132:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1133:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7509,12 +32201,77 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1134:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1135:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1136:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7525,12 +32282,77 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1137:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1138:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1139:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7542,13 +32364,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1140:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1141:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1142:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7561,13 +32450,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1143:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1144:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1145:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7582,13 +32538,78 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1146:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1147:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1148:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7601,14 +32622,81 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1149:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1150:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1151:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7624,12 +32712,120 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1152:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1153:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1154:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1155:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1156:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1157:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1158:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1159:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7640,12 +32836,120 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1160:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1161:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1162:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1163:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1164:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1165:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1166:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1167:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7657,13 +32961,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1168:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1169:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1170:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1171:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1172:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1173:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1174:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1175:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7676,13 +33089,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1176:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1177:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1178:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1179:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1180:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1181:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1182:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1183:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7697,13 +33219,122 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1184:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1185:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1186:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1187:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1188:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1189:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1190:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1191:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7716,14 +33347,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1192:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1193:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1194:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1195:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1196:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1197:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1198:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1199:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7739,12 +33480,125 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1200:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1201:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1202:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1203:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1204:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1205:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1206:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1207:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7755,12 +33609,125 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1208:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1209:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1210:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1211:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1212:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1213:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1214:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1215:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7772,13 +33739,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1216:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1217:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1218:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1219:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1220:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1221:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1222:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1223:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7791,13 +33872,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1227:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1228:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1229:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1230:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1231:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7812,13 +34007,127 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1232:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1233:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1234:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1235:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1236:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1237:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1238:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1239:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7831,14 +34140,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1240:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1241:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1242:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1243:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1244:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1245:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1246:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1247:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7854,12 +34278,25 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7870,12 +34307,26 @@ entry:
}
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7887,13 +34338,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7907,13 +34379,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7929,13 +34423,27 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -7948,14 +34456,36 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -7972,11 +34502,37 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7987,11 +34543,38 @@ entry:
}
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8003,12 +34586,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8022,12 +34639,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8043,12 +34695,39 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8061,13 +34740,48 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8084,11 +34798,36 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8099,11 +34838,37 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8115,12 +34880,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8134,12 +34932,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8155,12 +34987,38 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8173,13 +35031,47 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8196,11 +35088,41 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8211,11 +35133,42 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1251:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1252:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1253:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8227,12 +35180,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1254:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1255:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1256:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8246,12 +35237,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1257:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1258:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1259:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8267,12 +35297,43 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1260:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1261:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1262:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8285,13 +35346,52 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1263:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1264:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1265:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8308,11 +35408,48 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1266:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1267:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1268:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8323,11 +35460,49 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1269:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1270:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1271:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8339,12 +35514,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1272:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1273:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1274:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8358,12 +35578,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1275:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1276:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1277:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8379,12 +35645,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1278:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1279:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1280:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8397,13 +35701,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1281:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1282:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1283:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8420,12 +35770,55 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8436,12 +35829,56 @@ entry:
}
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8453,13 +35890,74 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8473,13 +35971,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8495,13 +36055,57 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8514,14 +36118,76 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8538,12 +36204,54 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8554,12 +36262,55 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8571,13 +36322,73 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8591,13 +36402,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8613,13 +36485,56 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8632,14 +36547,75 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8656,12 +36632,43 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1284:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1285:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1286:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8672,12 +36679,44 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1287:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1288:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1289:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8689,13 +36728,62 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1290:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1291:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1292:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8709,13 +36797,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1293:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1294:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1295:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8731,13 +36869,45 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1296:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1297:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1298:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8750,14 +36920,64 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1299:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1300:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1301:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8774,12 +36994,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1302:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1303:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1304:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8790,12 +37048,51 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1305:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1306:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1307:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8807,13 +37104,69 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1308:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1309:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1310:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8827,13 +37180,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1311:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1312:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1313:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8849,13 +37259,52 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1314:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1315:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1316:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8868,14 +37317,71 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1317:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1318:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1319:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8892,12 +37398,20 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -8908,12 +37422,20 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -8925,13 +37447,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -8944,13 +37475,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -8965,13 +37505,22 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -8984,14 +37533,24 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9007,12 +37566,70 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1320:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1321:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1322:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9023,12 +37640,70 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1323:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1324:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1325:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9040,13 +37715,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1326:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1327:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1328:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9059,13 +37793,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1329:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1330:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1331:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9080,13 +37873,72 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1332:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1333:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1334:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9099,14 +37951,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1335:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1336:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1337:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9122,12 +38034,75 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1338:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1339:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1340:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9138,12 +38113,75 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1341:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1342:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1343:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9155,13 +38193,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1344:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1345:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1346:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9174,13 +38276,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1347:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1348:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1349:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9195,13 +38361,77 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1350:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1351:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1352:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9214,14 +38444,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1353:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1354:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1355:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9237,11 +38532,125 @@ entry:
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1356:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1357:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1358:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1359:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1360:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1361:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1362:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1363:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9252,11 +38661,125 @@ entry:
}
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1364:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1365:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1366:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1367:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1368:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1369:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1370:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1371:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9268,12 +38791,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1372:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1373:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1374:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1375:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1376:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1377:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1378:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1379:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9286,12 +38924,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1380:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1381:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1382:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1383:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1384:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1385:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1386:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1387:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9306,11 +39059,130 @@ entry:
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1388:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1389:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1390:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1391:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1392:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1393:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1394:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1395:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9321,11 +39193,130 @@ entry:
}
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1396:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1397:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1398:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1399:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1400:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1401:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1402:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1403:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9337,12 +39328,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1404:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1405:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1406:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1407:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1408:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1409:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1410:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1411:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9355,12 +39466,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1412:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1413:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1414:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1415:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1416:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1417:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1418:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1419:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9375,12 +39606,49 @@ entry:
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1420:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1421:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1422:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -9391,12 +39659,49 @@ entry:
}
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1423:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1424:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1425:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9408,13 +39713,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1426:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1427:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1428:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -9427,13 +39778,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1429:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1430:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1431:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9448,11 +39845,27 @@ entry:
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9463,11 +39876,27 @@ entry:
}
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9479,12 +39908,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9497,12 +39943,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9517,11 +39980,75 @@ entry:
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1432:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1433:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1434:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9532,11 +40059,75 @@ entry:
}
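(Illustrative sketch, not part of the diff.) The long NoVLX sequences above widen a k-register result by isolating each mask bit with a kshiftlw/kshiftrw pair and reinserting it with vpinsrb: shifting left by 15 - i and then right by 15 leaves only bit i, zero-extended. The scalar equivalent of one such extraction, as plain IR with a made-up function name:

define i16 @extract_mask_bit(i16 %mask, i16 %i) {
entry:
  ; kshiftlw $(15 - i) followed by kshiftrw $15 reduces to (mask >> i) & 1
  %sh = lshr i16 %mask, %i
  %bit = and i16 %sh, 1
  ret i16 %bit
}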
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1435:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1436:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1437:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9548,12 +40139,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1438:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1439:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1440:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9566,12 +40222,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1441:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1442:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1443:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9586,11 +40307,80 @@ entry:
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1444:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1445:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1446:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9601,11 +40391,80 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1447:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1448:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1449:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9617,12 +40476,82 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1450:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1451:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1452:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9635,12 +40564,82 @@ entry:
}
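(Illustrative sketch, not part of the diff.) For the i64-returning variants, the NoVLX path spills the mask words to the stack, reloads two 32-bit halves, and assembles the result with shlq $32 / orq, as in the epilogues above. The same combination expressed in IR, with a made-up function name:

define i64 @combine_mask_halves(i32 %lo, i32 %hi) {
entry:
  %l = zext i32 %lo to i64
  %h = zext i32 %hi to i64
  ; mirrors "shlq $32, %rcx" then "orq %rcx, %rax"
  %hs = shl i64 %h, 32
  %r = or i64 %hs, %l
  ret i64 %r
}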
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1453:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1454:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1455:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9655,12 +40654,126 @@ entry:
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1456:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1457:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1458:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1459:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1460:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1461:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1462:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1463:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9671,12 +40784,126 @@ entry:
}
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1464:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1465:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1466:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1467:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1468:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1469:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1470:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1471:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9688,13 +40915,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1472:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1473:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1474:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1475:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1476:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1477:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1478:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1479:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9707,13 +41049,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1480:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1481:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1482:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1483:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1484:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1485:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1486:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1487:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9728,12 +41185,131 @@ entry:
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1488:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1489:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1490:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1491:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1492:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1493:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1494:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1495:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9744,12 +41320,131 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1496:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1497:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1498:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1499:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1500:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1501:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1502:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1503:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9761,13 +41456,133 @@ entry:
}
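; The masked variants below additionally move the scalar mask %__u into a k
; register (kmovd %edi, %k1 in the VLX checks, kmovw %edi, %k1 in the NoVLX
; ones) and apply it to the compare, either directly as a write mask on
; vpcmpltuw or on the vptestmd that materializes the <16 x i1> result.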
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1504:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1505:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1506:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1507:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1508:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1509:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1510:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1511:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9780,13 +41595,133 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1512:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1513:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1514:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1515:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1516:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1517:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1518:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1519:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9801,12 +41736,353 @@ entry:
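; The v32i1 tests below compare <32 x i16> values, i.e. full zmm operands.
; Since the NoVLX target cannot compare words in zmm registers either, the
; expected code rebuilds two ymm halves per operand by scalarizing the 128-bit
; quarters (vextracti32x4 + vmovq/vpextrq, then shifts and vpinsrw), runs the
; same sign-flipped vpcmpgtw sequence on each half, and stitches the two
; resulting 16-bit mask words together through the stack.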
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1520:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1521:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1522:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -9817,12 +42093,268 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1523:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1524:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1525:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9834,13 +42366,363 @@ entry:
}
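; In the masked v32i1 variants the 32-bit mask %__u is spilled to the stack,
; reloaded 16 bits at a time into k1/k2, materialized as all-ones dwords with
; vpternlogd $255 ... {%k1} {z}, narrowed to bytes with vpmovdb, and vpand-ed
; with the byte-sized compare results before they are re-tested into k
; registers.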
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1526:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1527:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1528:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm7
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3
+; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4
+; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2
+; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -9853,13 +42735,278 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1529:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1530:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1531:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9874,11 +43021,54 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9889,11 +43079,54 @@ entry:
}
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9905,12 +43138,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9924,12 +43218,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9945,11 +43300,55 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -9962,12 +43361,74 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -9984,11 +43445,53 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9999,11 +43502,53 @@ entry:
}
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10015,12 +43560,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10034,12 +43639,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10055,11 +43720,54 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10072,12 +43780,73 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10094,11 +43863,42 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1532:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1533:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1534:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10109,11 +43909,42 @@ entry:
}
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1535:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1536:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1537:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10125,12 +43956,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1538:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1539:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1540:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10144,12 +44024,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1541:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1542:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1543:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10165,11 +44094,43 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1544:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1545:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1546:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10182,12 +44143,62 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1547:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1548:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1549:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10204,11 +44215,49 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1550:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1551:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1552:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10219,11 +44268,49 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1553:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1554:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1555:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10235,12 +44322,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1556:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1557:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1558:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10254,12 +44397,68 @@ entry:
}
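; In the masked variants, NoVLX cannot apply the i8 mask %__u as a k-register
; write mask on an xmm compare, so each of its low four bits is isolated with
; a kshiftlw/kshiftrw pair, moved to a GPR, and rebuilt into a vector with
; vmovd/vpinsrb; the vpand of that vector against the compare result applies
; the mask before the same stack-based i64 widening as above.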
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1559:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1560:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1561:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10275,11 +44474,50 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1562:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1563:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1564:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10292,12 +44530,69 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1565:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1566:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1567:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10314,21 +44609,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10340,21 +44637,23 @@ entry:
}
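; For the 256-bit inputs NoVLX has no ymm compare, so the ymm registers are
; treated as the low halves of zmm0/zmm1 (the "kill" annotations) and compared
; with the 512-bit vpcmpltud; the kshiftlw/kshiftrw pair by 8 then clears the
; upper eight k0 bits that came from the undefined upper zmm lanes.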
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10367,23 +44666,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10397,23 +44698,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10429,21 +44732,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10457,23 +44762,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10490,12 +44797,72 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1568:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1569:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1570:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10506,12 +44873,72 @@ entry:
}
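; Widening the eight-bit result to i32 again goes through memory: each k0 bit
; is extracted to a GPR, packed into xmm0 with vpinsrb, sign-extended to a zmm
; mask, retested into k0, and stored with kmovw next to a zeroed word on the
; aligned stack, so the movl reload yields the zero-extended 32-bit mask.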
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1571:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1572:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1573:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10523,13 +44950,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1574:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1575:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1576:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10542,13 +45031,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1577:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1578:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1579:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10563,12 +45114,72 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1580:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1581:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1582:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10581,13 +45192,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1583:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1584:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1585:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10603,12 +45276,77 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1586:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1587:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1588:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10619,12 +45357,77 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1589:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1590:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1591:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10636,13 +45439,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1592:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1593:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1594:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10655,13 +45525,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1595:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1596:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1597:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10676,12 +45613,77 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1598:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1599:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1600:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10694,13 +45696,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1601:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1602:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1603:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10716,12 +45785,120 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1604:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1605:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1606:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1607:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1608:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1609:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1610:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1611:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10732,12 +45909,120 @@ entry:
}
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1612:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1613:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1614:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1615:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1616:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1617:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1618:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1619:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10749,13 +46034,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1620:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1621:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1622:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1623:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1624:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1625:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1626:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1627:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10768,13 +46162,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1628:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1629:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1630:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1631:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1632:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1633:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1634:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1635:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10789,12 +46292,120 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1636:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1637:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1638:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1639:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1640:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1641:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1642:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1643:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10807,13 +46418,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1644:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1645:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1646:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1647:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1648:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1649:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1650:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1651:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10829,12 +46549,125 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1652:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1653:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1654:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1655:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1656:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1657:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1658:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1659:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10845,12 +46678,125 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1660:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1661:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1662:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1663:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1664:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1665:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1666:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1667:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10862,13 +46808,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1668:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1669:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1670:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1671:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1672:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1673:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1674:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1675:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10881,13 +46941,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1676:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1677:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1678:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1679:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1680:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1681:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1682:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1683:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10902,12 +47076,125 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1684:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1685:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1686:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1687:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1688:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1689:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1690:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1691:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10920,13 +47207,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1692:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1693:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1694:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1695:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1696:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1697:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1698:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1699:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10942,12 +47343,26 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -10958,12 +47373,26 @@ entry:
}
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10975,13 +47404,37 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -10995,13 +47448,37 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11017,12 +47494,27 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11035,13 +47527,38 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11058,11 +47575,38 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11073,11 +47617,38 @@ entry:
}
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11089,12 +47660,49 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11108,12 +47716,49 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11129,11 +47774,39 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11146,12 +47819,50 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11168,11 +47879,37 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11183,11 +47920,37 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11199,12 +47962,48 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11218,12 +48017,48 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11239,11 +48074,38 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11256,12 +48118,49 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11278,11 +48177,42 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1700:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1701:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1702:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11293,11 +48223,42 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1703:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1704:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1705:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11309,12 +48270,53 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1706:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1707:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1708:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11328,12 +48330,53 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1709:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1710:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1711:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11349,11 +48392,43 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1712:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1713:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1714:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11366,12 +48441,54 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1715:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1716:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1717:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11388,11 +48505,49 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1718:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1719:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1720:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11403,11 +48558,49 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1721:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1722:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1723:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11419,12 +48612,60 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1724:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1725:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1726:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11438,12 +48679,60 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1727:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1728:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1729:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11459,11 +48748,50 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1730:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1731:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1732:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11476,12 +48804,61 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1733:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1734:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1735:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11498,12 +48875,56 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11514,12 +48935,56 @@ entry:
}
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11531,13 +48996,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11551,13 +49078,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11573,12 +49162,57 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11591,13 +49225,76 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11614,12 +49311,55 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11630,12 +49370,55 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11647,13 +49430,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11667,13 +49511,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11689,12 +49594,56 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11707,13 +49656,75 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11730,12 +49741,44 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1736:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1737:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1738:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11746,12 +49789,44 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1739:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1740:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1741:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11763,13 +49838,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1742:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1743:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1744:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11783,13 +49908,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1745:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1746:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1747:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11805,12 +49980,45 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1748:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1749:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1750:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11823,13 +50031,64 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1751:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1752:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1753:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11846,12 +50105,51 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1754:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1755:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1756:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11862,12 +50160,51 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1757:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1758:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1759:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11879,13 +50216,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1760:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1761:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1762:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11899,13 +50293,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1763:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1764:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1765:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11921,12 +50372,52 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1766:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1767:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1768:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11939,13 +50430,71 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1769:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1770:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1771:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11962,12 +50511,20 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -11978,12 +50535,20 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -11995,13 +50560,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12014,13 +50588,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12035,12 +50618,20 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12053,13 +50644,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12075,12 +50675,70 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1772:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1773:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1774:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12091,12 +50749,70 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1775:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1776:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1777:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12108,13 +50824,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1778:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1779:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1780:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12127,13 +50902,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1781:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1782:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1783:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12148,12 +50982,70 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1784:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1785:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1786:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12166,13 +51058,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1787:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1788:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1789:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12188,12 +51139,75 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1790:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1791:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1792:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12204,12 +51218,75 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1793:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1794:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1795:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12221,13 +51298,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1796:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1797:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1798:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12240,13 +51381,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1799:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1800:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1801:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12261,12 +51466,75 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1802:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1803:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1804:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12279,13 +51547,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1805:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1806:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1807:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12302,11 +51634,51 @@ entry:
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12317,11 +51689,51 @@ entry:
}
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12333,11 +51745,52 @@ entry:
}
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12351,11 +51804,50 @@ entry:
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12366,11 +51858,50 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12382,11 +51913,51 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12400,11 +51971,39 @@ entry:
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12415,11 +52014,39 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12431,11 +52058,40 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1814:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1815:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1816:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12449,11 +52105,46 @@ entry:
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1817:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1818:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1819:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12464,11 +52155,46 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1820:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1821:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1822:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12480,11 +52206,47 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1823:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1824:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1825:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12498,21 +52260,23 @@ entry:
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12524,21 +52288,23 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12551,21 +52317,23 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12580,12 +52348,72 @@ entry:
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1826:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1827:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1828:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%1 = bitcast <4 x i64> %__b to <8 x float>
@@ -12596,12 +52424,72 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1829:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1830:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1831:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -12613,12 +52501,72 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1832:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1833:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1834:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
@@ -12632,12 +52580,77 @@ entry:
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%1 = bitcast <4 x i64> %__b to <8 x float>
@@ -12648,12 +52661,77 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -12665,12 +52743,77 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
@@ -12684,12 +52827,120 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1847:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1848:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1849:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1850:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1851:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12700,12 +52951,120 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1852:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1853:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1854:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1855:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1856:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1857:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1858:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1859:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12717,12 +53076,120 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1860:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1861:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1862:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1863:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1864:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1865:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1866:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1867:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load float, float* %__b
@@ -12736,12 +53203,18 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovw %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12752,12 +53225,125 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1871:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1872:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1873:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1874:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1875:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12768,12 +53354,125 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1876:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1877:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1878:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1879:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1880:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1881:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1882:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1883:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12785,12 +53484,125 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1884:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1885:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1886:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1887:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1888:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1889:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1890:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1891:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load float, float* %__b
@@ -12804,13 +53616,20 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12822,12 +53641,23 @@ entry:
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12838,12 +53668,23 @@ entry:
}
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12855,12 +53696,24 @@ entry:
}
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12874,11 +53727,35 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12889,11 +53766,35 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12905,11 +53806,36 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12923,11 +53849,34 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12938,11 +53887,34 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12954,11 +53926,35 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12972,11 +53968,39 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12987,11 +54011,39 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1895:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1896:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1897:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -13003,11 +54055,40 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1898:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1899:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1900:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -13021,11 +54102,46 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1901:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1902:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1903:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -13036,11 +54152,46 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -13052,11 +54203,47 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1907:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1908:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1909:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -13070,12 +54257,53 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13086,12 +54314,53 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13103,12 +54372,54 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13122,12 +54433,52 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13138,12 +54489,52 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13155,12 +54546,53 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13174,12 +54606,41 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1910:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1911:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1912:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13190,12 +54651,41 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1913:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1914:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1915:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13207,12 +54697,42 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1916:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1917:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1918:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13226,12 +54746,48 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1919:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1920:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1921:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13242,12 +54798,48 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1922:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1923:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1924:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13259,12 +54851,49 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1925:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1926:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1927:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13278,12 +54907,20 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13294,12 +54931,20 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13311,12 +54956,20 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13330,12 +54983,22 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13346,12 +55009,70 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13362,12 +55083,70 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1931:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1932:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1933:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13379,12 +55158,70 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1934:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1935:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1936:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13398,12 +55235,19 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovb %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13414,12 +55258,75 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1937:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1938:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1939:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13430,12 +55337,75 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1940:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1941:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1942:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13447,12 +55417,75 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1943:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1944:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1945:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13466,13 +55499,20 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 092b139fca2f9..1d78ee26a0b9b 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -1,48 +1,48 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
; SSE2-LABEL: v8i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i16> %a, %b
%x1 = icmp sgt <8 x i16> %c, %d
@@ -53,25 +53,25 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -87,25 +87,25 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -121,29 +121,29 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <16 x i8> %a, %b
%x1 = icmp sgt <16 x i8> %c, %d
@@ -154,7 +154,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -206,11 +206,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -235,11 +235,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -264,11 +264,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $56, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $56, %xmm2, %xmm2
@@ -292,7 +292,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -344,11 +344,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -373,11 +373,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -402,11 +402,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $48, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2
@@ -430,7 +430,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -474,11 +474,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -499,11 +499,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -524,11 +524,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $32, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -552,7 +552,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
@@ -576,20 +576,20 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -605,25 +605,25 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -639,7 +639,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm3
; SSE2-SSSE3-NEXT: psrad $24, %xmm3
; SSE2-SSSE3-NEXT: pslld $24, %xmm2
@@ -652,11 +652,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $24, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
@@ -669,11 +669,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $24, %xmm3, %xmm3
; AVX512-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX512-NEXT: vpslld $24, %xmm2, %xmm2
@@ -697,7 +697,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm3
; SSE2-SSSE3-NEXT: psrad $16, %xmm3
; SSE2-SSSE3-NEXT: pslld $16, %xmm2
@@ -710,11 +710,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $16, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
@@ -727,11 +727,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $16, %xmm3, %xmm3
; AVX512-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX512-NEXT: vpslld $16, %xmm2, %xmm2
@@ -755,7 +755,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-LABEL: v8i8:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: psllw $8, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: psllw $8, %xmm2
@@ -770,11 +770,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i8:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: psllw $8, %xmm3
; SSSE3-NEXT: psraw $8, %xmm3
; SSSE3-NEXT: psllw $8, %xmm2
@@ -788,11 +788,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
@@ -806,11 +806,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX512-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX512-NEXT: vpsllw $8, %xmm2, %xmm2
@@ -822,7 +822,7 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i8> %a, %b
%x1 = icmp sgt <8 x i8> %c, %d
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index a6d6ca155302e..95529686a58af 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-LABEL: v4i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1
@@ -57,11 +57,11 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v4i64:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -74,12 +74,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -88,12 +88,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -110,7 +110,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; SSE2-SSSE3-LABEL: v4f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
@@ -123,11 +123,11 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; SSE2-SSSE3-NEXT: psrad $31, %xmm6
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -136,12 +136,12 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -158,7 +158,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSE2-LABEL: v16i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -181,11 +181,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm3, %xmm1
@@ -208,11 +208,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX1-LABEL: v16i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
@@ -225,12 +225,12 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -239,16 +239,16 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtw %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = icmp sgt <16 x i16> %a, %b
@@ -260,7 +260,7 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-LABEL: v8i32:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
@@ -287,11 +287,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
@@ -310,11 +310,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSSE3-NEXT: pand %xmm0, %xmm4
; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm4, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX1-LABEL: v8i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
@@ -328,12 +328,12 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -343,16 +343,16 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i32> %a, %b
@@ -364,7 +364,7 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; SSE2-LABEL: v8f32:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: cmpltps %xmm1, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
@@ -391,11 +391,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8f32:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm3
@@ -414,11 +414,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; SSSE3-NEXT: pand %xmm2, %xmm6
; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm6, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -428,16 +428,16 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = fcmp ogt <8 x float> %a, %b
@@ -449,7 +449,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-LABEL: v32i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4
@@ -561,14 +561,14 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v32i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: Lcfi0:
+; AVX1-NEXT: .Lcfi0:
; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: Lcfi1:
+; AVX1-NEXT: .Lcfi1:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: Lcfi2:
+; AVX1-NEXT: .Lcfi2:
; AVX1-NEXT: .cfi_def_cfa_register %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $32, %rsp
@@ -687,7 +687,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -696,7 +696,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtb %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
new file mode 100644
index 0000000000000..2eba79b0297f9
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -0,0 +1,1868 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
+; SSE-LABEL: v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pshufb %xmm3, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
+; SSE-NEXT: pslld $31, %xmm9
+; SSE-NEXT: psrad $31, %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
+; SSE-NEXT: pslld $31, %xmm8
+; SSE-NEXT: psrad $31, %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <8 x i64> %a, %b
+ %x1 = icmp sgt <8 x i64> %c, %d
+ %y = and <8 x i1> %x0, %x1
+ %res = bitcast <8 x i1> %y to i8
+ ret i8 %res
+}
+
+define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d) {
+; SSE-LABEL: v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltpd %xmm3, %xmm7
+; SSE-NEXT: cmpltpd %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE-NEXT: pslld $31, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm2, %xmm6
+; SSE-NEXT: cmpltpd %xmm1, %xmm5
+; SSE-NEXT: cmpltpd %xmm0, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE-NEXT: pslld $31, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufb %xmm2, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: psllw $15, %xmm4
+; SSE-NEXT: psraw $15, %xmm4
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
+; SSE-NEXT: pslld $31, %xmm9
+; SSE-NEXT: psrad $31, %xmm9
+; SSE-NEXT: pshufb %xmm2, %xmm9
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
+; SSE-NEXT: pslld $31, %xmm8
+; SSE-NEXT: psrad $31, %xmm8
+; SSE-NEXT: pshufb %xmm2, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pand %xmm4, %xmm8
+; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX12-LABEL: v8f64:
+; AVX12: # BB#0:
+; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX12-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2
+; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX12-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX12-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX12-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX12-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
+;
+; AVX512F-LABEL: v8f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = fcmp ogt <8 x double> %a, %b
+ %x1 = fcmp ogt <8 x double> %c, %d
+ %y = and <8 x i1> %x0, %x1
+ %res = bitcast <8 x i1> %y to i8
+ ret i8 %res
+}
+
+define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
+; SSE-LABEL: v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm5, %xmm1
+; SSE-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE-NEXT: pshufb %xmm5, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE-NEXT: pshufb %xmm5, %xmm3
+; SSE-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE-NEXT: pshufb %xmm5, %xmm2
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm5, %xmm11
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm5, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0]
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm5, %xmm10
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm5, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE-NEXT: pand %xmm2, %xmm9
+; SSE-NEXT: pextrb $15, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi1:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi2:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <32 x i16> %a, %b
+ %x1 = icmp sgt <32 x i16> %c, %d
+ %y = and <32 x i1> %x0, %x1
+ %res = bitcast <32 x i1> %y to i32
+ ret i32 %res
+}
+
+define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
+; SSE-LABEL: v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: pshufb %xmm7, %xmm2
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: psllw $15, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pshufb %xmm7, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pshufb %xmm7, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: pshufb %xmm3, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtb %xmm0, %xmm4
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm7, %xmm11
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm7, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE-NEXT: psllw $15, %xmm9
+; SSE-NEXT: psraw $15, %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm7, %xmm10
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm7, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $7, %xmm8
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pcmpgtb %xmm8, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm3
+; AVX1-NEXT: vpacksswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX2-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <16 x i32> %a, %b
+ %x1 = icmp sgt <16 x i32> %c, %d
+ %y = and <16 x i1> %x0, %x1
+ %res = bitcast <16 x i1> %y to i16
+ ret i16 %res
+}
+
+define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) {
+; SSE-LABEL: v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: pshufb %xmm3, %xmm6
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT: psllw $15, %xmm6
+; SSE-NEXT: psraw $15, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm2, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: pshufb %xmm3, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: pshufb %xmm3, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-NEXT: psllw $15, %xmm4
+; SSE-NEXT: psraw $15, %xmm4
+; SSE-NEXT: pshufb %xmm2, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: psllw $7, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtb %xmm4, %xmm5
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm3, %xmm11
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE-NEXT: psllw $15, %xmm9
+; SSE-NEXT: psraw $15, %xmm9
+; SSE-NEXT: pshufb %xmm2, %xmm9
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm3, %xmm10
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pshufb %xmm2, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $7, %xmm8
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: pcmpgtb %xmm8, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX12-LABEL: v16f32:
+; AVX12: # BB#0:
+; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm5
+; AVX12-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX12-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
+; AVX12-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm4
+; AVX12-NEXT: vextractf128 $1, %ymm4, %xmm6
+; AVX12-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX12-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX12-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX12-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
+;
+; AVX512F-LABEL: v16f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = fcmp ogt <16 x float> %a, %b
+ %x1 = fcmp ogt <16 x float> %c, %d
+ %y = and <16 x i1> %x0, %x1
+ %res = bitcast <16 x i1> %y to i16
+ ret i16 %res
+}
+
+define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; SSE-LABEL: v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: pextrb $15, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtb %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl (%rsp), %ecx
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl (%rsp), %ecx
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi4:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi5:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl (%rsp), %ecx
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: shlq $32, %rax
+; AVX512F-NEXT: orq %rcx, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <64 x i8> %a, %b
+ %x1 = icmp sgt <64 x i8> %c, %d
+ %y = and <64 x i1> %x0, %x1
+ %res = bitcast <64 x i1> %y to i64
+ ret i64 %res
+}
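+; Reading the checks above: AVX1/AVX2 spill all 64 mask bytes to the stack
+; with vpextrb/andb/movb and reassemble the i64 from two 32-bit loads;
+; AVX512F packs 16 lanes at a time via vpmovsxbd/vpslld/vptestmd/kmovw,
+; while AVX512BW compares straight into k-registers and reads the whole
+; result with a single kmovq.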
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
new file mode 100644
index 0000000000000..9b6401d1a76c9
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -0,0 +1,3483 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
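+; Each test below bitcasts an iN mask argument to <N x i1> and sign-extends
+; it, so bit k of the input becomes an all-ones or all-zeros lane k. The
+; pre-AVX512 lowerings spill the mask to the stack and isolate bit k with a
+; shift pair, (x shl (63 - k)) ashr 63, which broadcasts that bit across the
+; register; the AVX512 lowerings move the mask straight into a k-register.
+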
+;
+; 128-bit vectors
+;
+
+define <2 x i64> @ext_i2_2i64(i2 %a0) {
+; SSE2-SSSE3-LABEL: ext_i2_2i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $3, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm1
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm0
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i2_2i64:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $3, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shlq $63, %rax
+; AVX12-NEXT: sarq $63, %rax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i2_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $3, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ %2 = sext <2 x i1> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
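+; The AVX512 path above moves the 2-bit mask into %k1 and uses a zero-masked
+; vpternlogq $255 (all-ones ternary function) to materialize -1 in exactly
+; the selected lanes.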
+
+define <4 x i32> @ext_i4_4i32(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $60, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $61, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i4_4i32:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $15, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shlq $60, %rax
+; AVX12-NEXT: sarq $63, %rax
+; AVX12-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
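+; Same pattern at i4/<4 x i32>: the scalar paths isolate each bit with a
+; shl/sar pair, while AVX512 builds the result as a zero-masked move of an
+; all-ones vector (vpcmpeqd + vmovdqa32 {%k1} {z}).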
+
+define <8 x i16> @ext_i8_8i16(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shrq $7, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $57, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $58, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $59, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $60, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $61, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i8_8i16:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $60, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $59, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $58, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $57, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrq $7, %rax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
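+; AVX512BW's vpmovm2w expands a k-register mask directly into 16-bit lanes,
+; so the whole i8 -> <8 x i16> sext collapses to kmovd + vpmovm2w.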
+
+define <16 x i8> @ext_i16_16i8(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi0:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi1:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi2:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi3:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi4:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi5:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi6:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi7:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi8:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi9:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi10:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi11:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; SSE2-SSSE3-NEXT: movq %rax, %r8
+; SSE2-SSSE3-NEXT: movq %rax, %r9
+; SSE2-SSSE3-NEXT: movq %rax, %r10
+; SSE2-SSSE3-NEXT: movq %rax, %r11
+; SSE2-SSSE3-NEXT: movq %rax, %r14
+; SSE2-SSSE3-NEXT: movq %rax, %r15
+; SSE2-SSSE3-NEXT: movq %rax, %r12
+; SSE2-SSSE3-NEXT: movq %rax, %r13
+; SSE2-SSSE3-NEXT: movq %rax, %rbx
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: movq %rax, %rdx
+; SSE2-SSSE3-NEXT: movq %rax, %rsi
+; SSE2-SSSE3-NEXT: movq %rax, %rdi
+; SSE2-SSSE3-NEXT: movq %rax, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rbp
+; SSE2-SSSE3-NEXT: movsbq %al, %rax
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm2
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
+; SSE2-SSSE3-NEXT: shrq $7, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i16_16i8:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $60, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $59, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $58, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $57, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movsbq %al, %rcx
+; AVX12-NEXT: shrq $7, %rcx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $55, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $54, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $53, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $52, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $51, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $50, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $49, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrq $15, %rax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
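+; Expanding 16 bits without mask registers is costly enough that the SSE2
+; lowering above claims every callee-saved GPR to hold the shifted copies of
+; the mask; AVX512 again reduces to kmovd + vpmovm2b.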
+
+;
+; 256-bit vectors
+;
+
+define <4 x i64> @ext_i4_4i64(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: shlq $63, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = sext <4 x i1> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
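+; For the 256-bit result, AVX1/AVX2 build two xmm halves with shl/sar and
+; vmovq/vpunpcklqdq, then join them with vinsertf128/vinserti128.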
+
+define <8 x i32> @ext_i8_8i32(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $59, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $59, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
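+; For 8 x i32 the AVX512 path needs no stack round-trip at all: kmovd takes
+; the i8 mask argument directly and the zero-masked vmovdqa32 produces the
+; sign-extended lanes.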
+
+define <16 x i16> @ext_i16_16i16(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm0
+; SSE2-SSSE3-NEXT: psraw $15, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm1
+; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: .Lcfi6:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi7:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi8:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi9:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi10:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Lcfi11:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $55, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %r10
+; AVX1-NEXT: movq %rax, %r11
+; AVX1-NEXT: movq %rax, %r14
+; AVX1-NEXT: movq %rax, %r15
+; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movq %rax, %r12
+; AVX1-NEXT: movq %rax, %r13
+; AVX1-NEXT: movq %rax, %rbx
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: movq %rax, %rsi
+; AVX1-NEXT: movsbq %al, %rbp
+; AVX1-NEXT: shlq $54, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shlq $53, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $52, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $51, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $50, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $49, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $15, %r9
+; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $63, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vmovd %r13d, %xmm1
+; AVX1-NEXT: shlq $62, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shrq $7, %rbp
+; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: .Lcfi3:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: .Lcfi4:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Lcfi5:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: .Lcfi6:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi7:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi8:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi9:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi10:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Lcfi11:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $55, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: movq %rax, %r14
+; AVX2-NEXT: movq %rax, %r15
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq %rax, %r12
+; AVX2-NEXT: movq %rax, %r13
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: movsbq %al, %rbp
+; AVX2-NEXT: shlq $54, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shlq $53, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $52, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $51, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $50, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $49, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrq $15, %r9
+; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $63, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vmovd %r13d, %xmm1
+; AVX2-NEXT: shlq $62, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shrq $7, %rbp
+; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
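+; The AVX1/AVX2 lowerings above fan the 16 mask bits out through every spare
+; GPR before reassembly with vpinsrw; AVX512BW is just kmovd + vpmovm2w into
+; ymm0.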
+
+define <32 x i8> @ext_i32_32i8(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi12:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi13:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi14:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi15:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi16:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi17:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi18:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi19:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi20:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi21:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi22:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi23:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
+; SSE2-SSSE3-NEXT: movq %rbx, %r8
+; SSE2-SSSE3-NEXT: movq %rbx, %r9
+; SSE2-SSSE3-NEXT: movq %rbx, %r10
+; SSE2-SSSE3-NEXT: movq %rbx, %r11
+; SSE2-SSSE3-NEXT: movq %rbx, %r14
+; SSE2-SSSE3-NEXT: movq %rbx, %r15
+; SSE2-SSSE3-NEXT: movq %rbx, %r12
+; SSE2-SSSE3-NEXT: movq %rbx, %r13
+; SSE2-SSSE3-NEXT: movq %rbx, %rdi
+; SSE2-SSSE3-NEXT: movq %rbx, %rcx
+; SSE2-SSSE3-NEXT: movq %rbx, %rdx
+; SSE2-SSSE3-NEXT: movq %rbx, %rbp
+; SSE2-SSSE3-NEXT: movq %rbx, %rsi
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: shrq $15, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
+; SSE2-SSSE3-NEXT: shlq $61, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: shlq $58, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm13
+; SSE2-SSSE3-NEXT: shlq $59, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm7
+; SSE2-SSSE3-NEXT: shlq $57, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: shrq $7, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm4
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm4
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi12:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi13:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi14:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: .Lcfi15:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi16:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi17:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi18:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi19:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, (%rsp)
+; AVX1-NEXT: movslq (%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $45, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $44, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $43, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $42, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $41, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $40, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $39, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $38, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $37, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $36, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $35, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $34, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: shlq $33, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vmovd %r8d, %xmm1
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX1-NEXT: shlq $62, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi12:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi13:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi14:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: .Lcfi15:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi16:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi17:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi18:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi19:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, (%rsp)
+; AVX2-NEXT: movslq (%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $45, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $44, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $43, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $42, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $41, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $40, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $39, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $38, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $37, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $36, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $35, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $34, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: shlq $33, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vmovd %r8d, %xmm1
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT: shlq $62, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = sext <32 x i1> %1 to <32 x i8>
+ ret <32 x i8> %2
+}
+
+;
+; 512-bit vectors
+;
+
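+; Bitcast an i8 mask to <8 x i1> and sign-extend each lane to i64. Targets
+; without mask registers extract each bit with scalar shift/and sequences and
+; rebuild the vector with unpacks and shift-based sign extension; the AVX512
+; run lowers the whole pattern to kmovd plus a zero-masked vpternlogq.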
+define <8 x i64> @ext_i8_8i64(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm3
+; SSE2-SSSE3-NEXT: psrad $31, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
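+; Bitcast an i16 mask to <16 x i1> and sign-extend each lane to i32. The SSE
+; and AVX runs extract each bit with shrl/andl sequences before widening and
+; sign-extending; the AVX512 run selects kmovd plus a zero-masked vpternlogd.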
+define <16 x i32> @ext_i16_16i32(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm3
+; SSE2-SSSE3-NEXT: psrad $31, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ ret <16 x i32> %2
+}
+
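+; Bitcast an i32 mask to <32 x i1> and sign-extend each lane to i16. Pre-AVX512
+; codegen spills the mask and extracts the bits one at a time; the AVX512 run
+; maps the pattern directly to kmovd plus vpmovm2w.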
+define <32 x i16> @ext_i32_32i16(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %edi, %eax
+; SSE2-SSSE3-NEXT: shrl $16, %eax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm0
+; SSE2-SSSE3-NEXT: psraw $15, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm1
+; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm2
+; SSE2-SSSE3-NEXT: psraw $15, %xmm2
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm3
+; SSE2-SSSE3-NEXT: psraw $15, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi20:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi21:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi22:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi23:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi24:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi25:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi26:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi27:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: movl %edi, %r12d
+; AVX1-NEXT: movl %edi, %r15d
+; AVX1-NEXT: movl %edi, %r14d
+; AVX1-NEXT: movl %edi, %ebx
+; AVX1-NEXT: movl %edi, %r11d
+; AVX1-NEXT: movl %edi, %r10d
+; AVX1-NEXT: movl %edi, %r9d
+; AVX1-NEXT: movl %edi, %r8d
+; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movl %edi, %edx
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $4, %esi
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX1-NEXT: shrl $5, %r8d
+; AVX1-NEXT: andl $1, %r8d
+; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $6, %r9d
+; AVX1-NEXT: andl $1, %r9d
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %r10d
+; AVX1-NEXT: andl $1, %r10d
+; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $8, %r11d
+; AVX1-NEXT: andl $1, %r11d
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $9, %ebx
+; AVX1-NEXT: andl $1, %ebx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $10, %r14d
+; AVX1-NEXT: andl $1, %r14d
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $11, %r15d
+; AVX1-NEXT: andl $1, %r15d
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $12, %r12d
+; AVX1-NEXT: andl $1, %r12d
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $13, %r13d
+; AVX1-NEXT: andl $1, %r13d
+; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi20:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi21:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi22:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi23:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi24:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi25:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi26:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi27:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, %r13d
+; AVX2-NEXT: movl %edi, %r12d
+; AVX2-NEXT: movl %edi, %r15d
+; AVX2-NEXT: movl %edi, %r14d
+; AVX2-NEXT: movl %edi, %ebx
+; AVX2-NEXT: movl %edi, %r11d
+; AVX2-NEXT: movl %edi, %r10d
+; AVX2-NEXT: movl %edi, %r9d
+; AVX2-NEXT: movl %edi, %r8d
+; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movl %edi, %edx
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $4, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrl $5, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $6, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $8, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $9, %ebx
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $10, %r14d
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $11, %r15d
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $12, %r12d
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $13, %r13d
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = sext <32 x i1> %1 to <32 x i16>
+ ret <32 x i16> %2
+}
+
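+; Bitcast an i64 mask to <64 x i1> and sign-extend each lane to i8. The
+; SSE2/SSSE3 run splits the mask into 16-bit chunks on the stack and
+; sign-extends each bit via shlq/sarq before repacking with unpacks.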
+define <64 x i8> @ext_i64_64i8(i64 %a0) {
+; SSE2-SSSE3-LABEL: ext_i64_64i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi24:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi25:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi26:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi27:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi28:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi29:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi30:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi31:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi32:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi33:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi34:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi35:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $32, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $48, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
+; SSE2-SSSE3-NEXT: movq %rbx, %r8
+; SSE2-SSSE3-NEXT: movq %rbx, %r9
+; SSE2-SSSE3-NEXT: movq %rbx, %r10
+; SSE2-SSSE3-NEXT: movq %rbx, %r11
+; SSE2-SSSE3-NEXT: movq %rbx, %r14
+; SSE2-SSSE3-NEXT: movq %rbx, %r15
+; SSE2-SSSE3-NEXT: movq %rbx, %r12
+; SSE2-SSSE3-NEXT: movq %rbx, %r13
+; SSE2-SSSE3-NEXT: movq %rbx, %rdi
+; SSE2-SSSE3-NEXT: movq %rbx, %rcx
+; SSE2-SSSE3-NEXT: movq %rbx, %rdx
+; SSE2-SSSE3-NEXT: movq %rbx, %rsi
+; SSE2-SSSE3-NEXT: movq %rbx, %rbp
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: shrq $15, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm2
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
+; SSE2-SSSE3-NEXT: shlq $61, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: shlq $58, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm13
+; SSE2-SSSE3-NEXT: shlq $59, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
+; SSE2-SSSE3-NEXT: shlq $57, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: shrq $7, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm13
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm15
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm7
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm11
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm13
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm15
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm11
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm6
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm7
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm6
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm8
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm7
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm7
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm5
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i64_64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi28:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi29:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi30:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi31:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi32:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi33:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi34:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi35:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $32, %rdi
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $45, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $44, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $43, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $42, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $41, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $40, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $39, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $38, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $37, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $36, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $35, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $34, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: shlq $33, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vmovd %r8d, %xmm1
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX1-NEXT: shlq $62, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $45, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $44, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $43, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $42, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: shlq $41, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $40, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $39, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $38, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $37, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $36, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $35, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: shlq $34, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $33, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: shlq $62, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
+; AVX1-NEXT: shlq $61, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rdi
+; AVX1-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi28:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi29:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi30:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi31:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi32:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi33:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi34:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi35:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $32, %rdi
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $45, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $44, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $43, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $42, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $41, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $40, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $39, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $38, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $37, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $36, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $35, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $34, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: shlq $33, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vmovd %r8d, %xmm1
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT: shlq $62, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $45, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $43, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $42, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: shlq $41, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $40, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $39, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $38, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $37, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $36, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $35, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: shlq $34, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $33, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: shlq $62, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
+; AVX2-NEXT: shlq $61, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rdi
+; AVX2-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i64_64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ %2 = sext <64 x i1> %1 to <64 x i8>
+ ret <64 x i8> %2
+}
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
new file mode 100644
index 0000000000000..aa9e60df14044
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -0,0 +1,3279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
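+; The shared prefixes group targets whose output is identical: SSE2 and
+; SSSE3 checks that agree use SSE2-SSSE3, and AVX1/AVX2 share AVX12.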
+
+;
+; 128-bit vectors
+;
+
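+; With no mask registers, the i2 mask is clamped with "andb $3", spilled to
+; the stack, reloaded, and each bit is isolated with a shift plus "andl $1"
+; before being moved into a 64-bit lane. AVX512 instead moves the bits into
+; a k-register and zero-masks a broadcast of the splat constant 1.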
+define <2 x i64> @ext_i2_2i64(i2 %a0) {
+; SSE2-SSSE3-LABEL: ext_i2_2i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $3, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i2_2i64:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $3, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i2_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $3, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ %2 = zext <2 x i1> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
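+; Same per-bit expansion for an i4 mask: "andb $15" clamps the input, each
+; bit is shifted into its lane, and a final "pand"/"vpand" against a splat
+; of 1 clears the upper bits of every 32-bit element.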
+define <4 x i32> @ext_i4_4i32(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = zext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
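+; All eight bits of the i8 are live, so no clamping "and" is needed up
+; front; each bit is shifted down, masked with "andl $1", and inserted into
+; a 16-bit lane. The AVX512 version extracts the bits from a k-register
+; with kshiftlw/kshiftrw pairs instead of scalar shifts.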
+define <8 x i16> @ext_i8_8i16(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i8_8i16:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $7, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k5
+; AVX512-NEXT: kshiftlw $8, %k5, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kshiftlw $9, %k5, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kshiftlw $10, %k5, %k2
+; AVX512-NEXT: kshiftrw $15, %k2, %k2
+; AVX512-NEXT: kshiftlw $11, %k5, %k3
+; AVX512-NEXT: kshiftrw $15, %k3, %k3
+; AVX512-NEXT: kshiftlw $12, %k5, %k4
+; AVX512-NEXT: kshiftrw $15, %k4, %k4
+; AVX512-NEXT: kshiftlw $13, %k5, %k6
+; AVX512-NEXT: kshiftrw $15, %k6, %k6
+; AVX512-NEXT: kshiftlw $15, %k5, %k7
+; AVX512-NEXT: kshiftrw $15, %k7, %k7
+; AVX512-NEXT: kshiftlw $14, %k5, %k5
+; AVX512-NEXT: kshiftrw $15, %k5, %k5
+; AVX512-NEXT: kmovd %k5, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovd %k7, %ecx
+; AVX512-NEXT: andl $1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k6, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k4, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k3, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k2, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
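+; The i16 case builds the 16-byte result one bit at a time. Even the AVX512
+; lowering round-trips through GPRs (kshift + kmovd + vpinsrb) and needs
+; every callee-saved register to hold the extracted bits.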
+define <16 x i8> @ext_i16_16i8(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i16_16i8:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $7, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $8, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $9, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $10, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $11, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $12, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $13, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $14, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $15, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Lcfi0:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Lcfi1:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Lcfi2:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: .Lcfi3:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: .Lcfi4:
+; AVX512-NEXT: .cfi_def_cfa_offset 48
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Lcfi5:
+; AVX512-NEXT: .cfi_def_cfa_offset 56
+; AVX512-NEXT: .Lcfi6:
+; AVX512-NEXT: .cfi_offset %rbx, -56
+; AVX512-NEXT: .Lcfi7:
+; AVX512-NEXT: .cfi_offset %r12, -48
+; AVX512-NEXT: .Lcfi8:
+; AVX512-NEXT: .cfi_offset %r13, -40
+; AVX512-NEXT: .Lcfi9:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Lcfi10:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Lcfi11:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r8d
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r9d
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r10d
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r11d
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r14d
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r15d
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r12d
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r13d
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %esi
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ebx
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ebp
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %edi
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ecx
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %edx
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: vmovd %r9d, %xmm0
+; AVX512-NEXT: kmovd %k0, %r9d
+; AVX512-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; 256-bit vectors
+;
+
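+; Bitcast an i4 mask to <4 x i1> and zero-extend to <4 x i64>. SSE and AVX1/2
+; mask off the upper bits, spill the scalar to the stack, and rebuild the
+; vector from per-bit shift+and extractions; AVX512 moves the mask into %k1
+; and uses a single zero-masked vpbroadcastq instead.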
+define <4 x i64> @ext_i4_4i64(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = zext <4 x i1> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
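+; i8 mask -> <8 x i32>. The scalar is spilled and each of the eight bits is
+; extracted with shift+and (AVX1/AVX2 defer the masking to one vector AND with
+; a splat of 1 at the end); AVX512 lowers the whole pattern to a single
+; zero-masked vpbroadcastd.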
+define <8 x i32> @ext_i8_8i32(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: shrl $4, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: shrl $4, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
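+; i16 mask -> <16 x i16>. SSE targets interleave single-bit movd results with
+; punpck; AVX1/AVX2 assemble two 128-bit halves with vpinsrw and join them
+; with vinsertf128/vinserti128; AVX512 selects a zero-masked vmovdqu16 of a
+; constant splat.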
+define <16 x i16> @ext_i16_16i16(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: shrl $8, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $15, %ecx
+; AVX1-NEXT: movzwl %cx, %ecx
+; AVX1-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: shrl $8, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $15, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
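+; i32 mask -> <32 x i8>. Note that AVX1/AVX2 set up an aligned frame (rbp,
+; andq $-32) even though the 32-byte spill slot goes unused in the code below;
+; the vector is still built bit-by-bit with vpinsrb from the low and high
+; halves of %edi.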
+define <32 x i8> @ext_i32_32i8(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrl $15, %edi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrl $15, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = zext <32 x i1> %1 to <32 x i8>
+ ret <32 x i8> %2
+}
+
+;
+; 512-bit vectors
+;
+
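+; i8 mask -> <8 x i64>, the first 512-bit case. AVX1/AVX2 widen a
+; vpinsrw-built <8 x i16> through vpmovzx steps and mask with a splat of 1;
+; AVX512 needs only kmovd plus a zero-masked vpbroadcastq into %zmm0.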
+define <8 x i64> @ext_i8_8i64(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
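+; i16 mask -> <16 x i32>. Same shape as above: vpinsrb-built bytes are widened
+; with vpmovzx and masked with a splat of 1 on AVX1/AVX2, while AVX512 is a
+; single zero-masked vpbroadcastd into %zmm0.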
+define <16 x i32> @ext_i16_16i32(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i32>
+ ret <16 x i32> %2
+}
+
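+; i32 mask -> <32 x i16>. The SSE2/SSSE3 lowering splits the mask into two
+; 16-bit halves, spilling each and expanding it bit-by-bit as in the cases
+; above.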
+define <32 x i16> @ext_i32_32i16(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %edi, %eax
+; SSE2-SSSE3-NEXT: shrl $16, %eax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi6:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi7:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi8:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi9:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi10:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: movl %edi, %r12d
+; AVX1-NEXT: movl %edi, %r15d
+; AVX1-NEXT: movl %edi, %r14d
+; AVX1-NEXT: movl %edi, %ebx
+; AVX1-NEXT: movl %edi, %r11d
+; AVX1-NEXT: movl %edi, %r10d
+; AVX1-NEXT: movl %edi, %r9d
+; AVX1-NEXT: movl %edi, %r8d
+; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movl %edi, %edx
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $4, %esi
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX1-NEXT: shrl $5, %r8d
+; AVX1-NEXT: andl $1, %r8d
+; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $6, %r9d
+; AVX1-NEXT: andl $1, %r9d
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %r10d
+; AVX1-NEXT: andl $1, %r10d
+; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $8, %r11d
+; AVX1-NEXT: andl $1, %r11d
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $9, %ebx
+; AVX1-NEXT: andl $1, %ebx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $10, %r14d
+; AVX1-NEXT: andl $1, %r14d
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $11, %r15d
+; AVX1-NEXT: andl $1, %r15d
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $12, %r12d
+; AVX1-NEXT: andl $1, %r12d
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $13, %r13d
+; AVX1-NEXT: andl $1, %r13d
+; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi4:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi5:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi6:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi7:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi8:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi9:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi10:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, %r13d
+; AVX2-NEXT: movl %edi, %r12d
+; AVX2-NEXT: movl %edi, %r15d
+; AVX2-NEXT: movl %edi, %r14d
+; AVX2-NEXT: movl %edi, %ebx
+; AVX2-NEXT: movl %edi, %r11d
+; AVX2-NEXT: movl %edi, %r10d
+; AVX2-NEXT: movl %edi, %r9d
+; AVX2-NEXT: movl %edi, %r8d
+; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movl %edi, %edx
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $4, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrl $5, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $6, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $8, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $9, %ebx
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $10, %r14d
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $11, %r15d
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $12, %r12d
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $13, %r13d
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = zext <32 x i1> %1 to <32 x i16>
+ ret <32 x i16> %2
+}
+
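+; @ext_i64_64i8 is the widest variant: out[i] = (uint8_t)((a0 >> i) & 1)
+; for i = 0..63 (scalar C sketch, illustrative only). AVX1/AVX2 build two
+; 256-bit halves from pinsrb chains over shifts of %rdi/%edi, while
+; AVX512BW needs only kmovq plus a zero-masked vmovdqu8 of a constant.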
+define <64 x i8> @ext_i64_64i8(i64 %a0) {
+; SSE2-SSSE3-LABEL: ext_i64_64i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $32, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $48, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm7
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i64_64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi11:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi12:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi13:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $49, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movq %rdi, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $50, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $51, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $52, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $53, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $54, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $55, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $57, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $58, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $59, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $60, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $61, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $62, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $33, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movq %rdi, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $34, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $35, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $36, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $37, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $38, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $39, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $40, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $41, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $42, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $43, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $44, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $45, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $46, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX1-NEXT: shrq $47, %rdi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi11:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi12:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi13:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $49, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movq %rdi, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $50, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $51, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $52, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $53, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $54, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $55, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $56, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $57, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $58, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $59, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $60, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $61, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $62, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $33, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movq %rdi, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $34, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $35, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $36, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $37, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $38, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $39, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $40, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $41, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $42, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $43, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $44, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $45, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $46, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX2-NEXT: shrq $47, %rdi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i64_64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k1
+; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ %2 = zext <64 x i1> %1 to <64 x i8>
+ ret <64 x i8> %2
+}
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
new file mode 100644
index 0000000000000..a190e05755228
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -0,0 +1,685 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i2_2i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i2_2i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i2_2i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i4_4i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: bitcast_i4_4i1:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i4_4i1:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i4_4i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i8_8i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i8_8i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $7, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i8_8i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i16_16i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i16_16i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $7, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $8, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $9, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $10, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $11, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $12, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $13, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $14, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $15, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i16_16i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i32_32i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %esi, (%rdi)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: bitcast_i32_32i1:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrl $15, %edi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i32_32i1:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrl $15, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i32_32i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ ret <32 x i1> %1
+}
+
+define <64 x i1> @bitcast_i64_64i1(i64 %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i64_64i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movq %rsi, (%rdi)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i64_64i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movq %rsi, (%rdi)
+; AVX12-NEXT: movq %rdi, %rax
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i64_64i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ ret <64 x i1> %1
+}
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 9bf7b41a4f26a..5616276da08d0 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -1,41 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: v8i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <8 x i16> %a, %b
%res = bitcast <8 x i1> %x to i8
@@ -44,21 +44,21 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -71,21 +71,21 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -98,24 +98,24 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <16 x i8> %a, %b
%res = bitcast <16 x i1> %x to i16
@@ -124,7 +124,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -151,11 +151,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -168,11 +168,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -185,11 +185,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
@@ -206,7 +206,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -233,11 +233,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -250,11 +250,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -267,11 +267,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
@@ -288,7 +288,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm0
@@ -311,11 +311,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -326,11 +326,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -341,11 +341,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
@@ -362,7 +362,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
@@ -375,18 +375,18 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -399,21 +399,21 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -426,29 +426,29 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm1
; SSE2-SSSE3-NEXT: psrad $24, %xmm1
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $24, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
@@ -465,29 +465,29 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm1
; SSE2-SSSE3-NEXT: psrad $16, %xmm1
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $16, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
@@ -504,7 +504,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSE2-LABEL: v8i8:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
@@ -513,11 +513,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i8:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: psllw $8, %xmm1
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: psllw $8, %xmm0
@@ -525,11 +525,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
@@ -537,18 +537,18 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <8 x i8> %a, %b
%res = bitcast <8 x i1> %x to i8
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index b2c619c48d4d3..86475c42e79e7 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -1,23 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: v16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v16i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = icmp sgt <16 x i16> %a, %b
@@ -26,19 +50,53 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
}
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v8i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = icmp sgt <8 x i32> %a, %b
@@ -47,19 +105,51 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
}
define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-NEXT: packsswb %xmm3, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: cmpltps %xmm1, %xmm3
+; SSSE3-NEXT: cmpltps %xmm0, %xmm2
+; SSSE3-NEXT: packsswb %xmm3, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v8f32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = fcmp ogt <8 x float> %a, %b
@@ -68,15 +158,241 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
}
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
+; SSE2-SSSE3-LABEL: v32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: shll $16, %ecx
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: orl %ecx, %eax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v32i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: vzeroupper
@@ -87,16 +403,56 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
}
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: v4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4i64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -109,16 +465,35 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
}
define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE2-SSSE3-LABEL: v4f64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4f64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
new file mode 100644
index 0000000000000..4a5ef99a86537
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -0,0 +1,1377 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
+; SSE-LABEL: v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE-NEXT: pextrb $14, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE-NEXT: pextrb $14, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE-NEXT: pextrb $14, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE-NEXT: pextrb $14, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi1:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi2:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <32 x i16> %a, %b
+ %res = bitcast <32 x i1> %x to i32
+ ret i32 %res
+}
+
+define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
+; SSE-LABEL: v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <16 x i32> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
+; SSE-LABEL: v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packsswb %xmm6, %xmm4
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = fcmp ogt <16 x float> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
+; SSE-LABEL: v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE-NEXT: pextrb $15, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE-NEXT: pextrb $15, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE-NEXT: pextrb $15, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE-NEXT: pextrb $15, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl (%rsp), %ecx
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl (%rsp), %ecx
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi4:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi5:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl (%rsp), %ecx
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: shlq $32, %rax
+; AVX512F-NEXT: orq %rcx, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <64 x i8> %a, %b
+ %res = bitcast <64 x i1> %x to i64
+ ret i64 %res
+}
+
+define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) {
+; SSE-LABEL: v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <8 x i64> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
+
+define i8 @v8f64(<8 x double> %a, <8 x double> %b) {
+; SSE-LABEL: v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltpd %xmm3, %xmm7
+; SSE-NEXT: cmpltpd %xmm2, %xmm6
+; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: cmpltpd %xmm1, %xmm5
+; SSE-NEXT: cmpltpd %xmm0, %xmm4
+; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packsswb %xmm6, %xmm4
+; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = fcmp ogt <8 x double> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index c7de65d84507b..b3f6534d14b3b 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -354,6 +354,7 @@ define void @unnatural_cfg2() {
; single-source GCC.
; CHECK-LABEL: unnatural_cfg2
; CHECK: %entry
+; CHECK: %loop.header
; CHECK: %loop.body1
; CHECK: %loop.body2
; CHECK: %loop.body4
@@ -361,7 +362,6 @@ define void @unnatural_cfg2() {
; CHECK: %loop.inner2.begin
; CHECK: %loop.body3
; CHECK: %loop.inner1.begin
-; CHECK: %loop.header
; CHECK: %bail
entry:
@@ -1491,6 +1491,102 @@ ret: ; preds = %endif, %then
ret void
}
+define i32 @not_rotate_if_extra_branch(i32 %count) {
+; This test checks that there is no loop rotation
+; if it introduces an extra branch.
+; Specifically, in this case the best exit is .header, but it
+; has a fallthrough to the .middle block, and the last block in
+; the loop chain, .slow, does not have a fallthrough to .header.
+; CHECK-LABEL: not_rotate_if_extra_branch
+; CHECK: %.entry
+; CHECK: %.header
+; CHECK: %.middle
+; CHECK: %.backedge
+; CHECK: %.slow
+; CHECK: %.bailout
+; CHECK: %.stop
+.entry:
+ %sum.0 = shl nsw i32 %count, 1
+ br label %.header
+
+.header:
+ %i = phi i32 [ %i.1, %.backedge ], [ 0, %.entry ]
+ %sum = phi i32 [ %sum.1, %.backedge ], [ %sum.0, %.entry ]
+ %is_exc = icmp sgt i32 %i, 9000000
+ br i1 %is_exc, label %.bailout, label %.middle, !prof !13
+
+.bailout:
+ %sum.2 = add nsw i32 %count, 1
+ br label %.stop
+
+.middle:
+ %pr.1 = and i32 %i, 1023
+ %pr.2 = icmp eq i32 %pr.1, 0
+ br i1 %pr.2, label %.slow, label %.backedge, !prof !14
+
+.slow:
+ tail call void @effect(i32 %sum)
+ br label %.backedge
+
+.backedge:
+ %sum.1 = add nsw i32 %i, %sum
+ %i.1 = add nsw i32 %i, 1
+ %end = icmp slt i32 %i.1, %count
+ br i1 %end, label %.header, label %.stop, !prof !15
+
+.stop:
+ %sum.phi = phi i32 [ %sum.1, %.backedge ], [ %sum.2, %.bailout ]
+ ret i32 %sum.phi
+}
+
+define i32 @not_rotate_if_extra_branch_regression(i32 %count, i32 %init) {
+; This is a regression test for the patch that avoids loop
+; rotation if it introduces an extra branch.
+; CHECK-LABEL: not_rotate_if_extra_branch_regression
+; CHECK: %.entry
+; CHECK: %.first_backedge
+; CHECK: %.slow
+; CHECK: %.second_header
+.entry:
+ %sum.0 = shl nsw i32 %count, 1
+ br label %.first_header
+
+.first_header:
+ %i = phi i32 [ %i.1, %.first_backedge ], [ 0, %.entry ]
+ %is_bo1 = icmp sgt i32 %i, 9000000
+ br i1 %is_bo1, label %.bailout, label %.first_backedge, !prof !14
+
+.first_backedge:
+ %i.1 = add nsw i32 %i, 1
+ %end = icmp slt i32 %i.1, %count
+ br i1 %end, label %.first_header, label %.second_header, !prof !13
+
+.second_header:
+ %j = phi i32 [ %j.1, %.second_backedge ], [ %init, %.first_backedge ]
+ %end.2 = icmp sgt i32 %j, %count
+ br i1 %end.2, label %.stop, label %.second_middle, !prof !14
+
+.second_middle:
+ %is_slow = icmp sgt i32 %j, 9000000
+ br i1 %is_slow, label %.slow, label %.second_backedge, !prof !14
+
+.slow:
+ tail call void @effect(i32 %j)
+ br label %.second_backedge
+
+.second_backedge:
+ %j.1 = add nsw i32 %j, 1
+ %end.3 = icmp slt i32 %j, 10000000
+ br i1 %end.3, label %.second_header, label %.stop, !prof !13
+
+.stop:
+ %res = add nsw i32 %j, %i.1
+ ret i32 %res
+
+.bailout:
+ ret i32 0
+}
+
declare void @effect(i32)
!5 = !{!"branch_weights", i32 84, i32 16}
@@ -1501,3 +1597,6 @@ declare void @effect(i32)
!10 = !{!"branch_weights", i32 90, i32 10}
!11 = !{!"branch_weights", i32 1, i32 1}
!12 = !{!"branch_weights", i32 5, i32 3}
+!13 = !{!"branch_weights", i32 1, i32 1}
+!14 = !{!"branch_weights", i32 1, i32 1023}
+!15 = !{!"branch_weights", i32 4095, i32 1}
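For reference, branch_weights metadata encodes successor probabilities as relative weights (weight_i / sum of weights, under the standard two-successor reading). A worked reading of the entries added above:

  !13 = 1:1    -> each successor ~50%
  !14 = 1:1023 -> first successor ~0.1%, second ~99.9%
  !15 = 4095:1 -> first successor 4095/4096, ~99.98%

So in not_rotate_if_extra_branch the backedge branch (!15) almost always returns to %.header and the .slow path (!14) is cold, which is why the expected layout keeps .header first instead of rotating the loop.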
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index a0a1c3646624f..7f7f9791d9038 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,45 +1,62 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: cmovnel %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%t2 = icmp ne i32 %t1, 0
%t3 = select i1 %t2, i32 %a, i32 %b
ret i32 %t3
-; CHECK: foo
-; CHECK: ptest
-; CHECK-NOT: testl
-; CHECK: cmov
-; CHECK: ret
}
define i32 @bar(<2 x i64> %c) {
+; CHECK-LABEL: bar:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: jne .LBB1_2
+; CHECK-NEXT: # BB#1: # %if-true-block
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_2: # %endif-block
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retq
entry:
%0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%1 = icmp ne i32 %0, 0
br i1 %1, label %if-true-block, label %endif-block
-if-true-block: ; preds = %entry
+if-true-block:
ret i32 0
-endif-block: ; preds = %entry,
+endif-block:
ret i32 1
-; CHECK: bar
-; CHECK: ptest
-; CHECK-NOT: testl
-; CHECK: jne
-; CHECK: ret
}
define i32 @bax(<2 x i64> %c) {
+; CHECK-LABEL: bax:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%t2 = icmp eq i32 %t1, 1
%t3 = zext i1 %t2 to i32
ret i32 %t3
-; CHECK: bax
-; CHECK: ptest
-; CHECK-NOT: cmpl
-; CHECK: ret
}
-define i16 @rnd16(i16 %arg) nounwind uwtable {
+define i16 @rnd16(i16 %arg) nounwind {
+; CHECK-LABEL: rnd16:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandw %cx
+; CHECK-NEXT: cmovbw %di, %ax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdrand.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
%3 = extractvalue { i16, i32 } %1, 1
@@ -47,14 +64,16 @@ define i16 @rnd16(i16 %arg) nounwind uwtable {
%5 = select i1 %4, i16 0, i16 %arg
%6 = add i16 %5, %2
ret i16 %6
-; CHECK: rnd16
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i32 @rnd32(i32 %arg) nounwind uwtable {
+define i32 @rnd32(i32 %arg) nounwind {
+; CHECK-LABEL: rnd32:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandl %ecx
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%2 = extractvalue { i32, i32 } %1, 0
%3 = extractvalue { i32, i32 } %1, 1
@@ -62,14 +81,16 @@ define i32 @rnd32(i32 %arg) nounwind uwtable {
%5 = select i1 %4, i32 0, i32 %arg
%6 = add i32 %5, %2
ret i32 %6
-; CHECK: rnd32
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i64 @rnd64(i64 %arg) nounwind uwtable {
+define i64 @rnd64(i64 %arg) nounwind {
+; CHECK-LABEL: rnd64:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandq %rcx
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
%1 = tail call { i64, i32 } @llvm.x86.rdrand.64() nounwind
%2 = extractvalue { i64, i32 } %1, 0
%3 = extractvalue { i64, i32 } %1, 1
@@ -77,14 +98,17 @@ define i64 @rnd64(i64 %arg) nounwind uwtable {
%5 = select i1 %4, i64 0, i64 %arg
%6 = add i64 %5, %2
ret i64 %6
-; CHECK: rnd64
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i16 @seed16(i16 %arg) nounwind uwtable {
+define i16 @seed16(i16 %arg) nounwind {
+; CHECK-LABEL: seed16:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedw %cx
+; CHECK-NEXT: cmovbw %di, %ax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdseed.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
%3 = extractvalue { i16, i32 } %1, 1
@@ -92,14 +116,16 @@ define i16 @seed16(i16 %arg) nounwind uwtable {
%5 = select i1 %4, i16 0, i16 %arg
%6 = add i16 %5, %2
ret i16 %6
-; CHECK: seed16
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i32 @seed32(i32 %arg) nounwind uwtable {
+define i32 @seed32(i32 %arg) nounwind {
+; CHECK-LABEL: seed32:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedl %ecx
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call { i32, i32 } @llvm.x86.rdseed.32() nounwind
%2 = extractvalue { i32, i32 } %1, 0
%3 = extractvalue { i32, i32 } %1, 1
@@ -107,14 +133,16 @@ define i32 @seed32(i32 %arg) nounwind uwtable {
%5 = select i1 %4, i32 0, i32 %arg
%6 = add i32 %5, %2
ret i32 %6
-; CHECK: seed32
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i64 @seed64(i64 %arg) nounwind uwtable {
+define i64 @seed64(i64 %arg) nounwind {
+; CHECK-LABEL: seed64:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedq %rcx
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
%1 = tail call { i64, i32 } @llvm.x86.rdseed.64() nounwind
%2 = extractvalue { i64, i32 } %1, 0
%3 = extractvalue { i64, i32 } %1, 1
@@ -122,11 +150,6 @@ define i64 @seed64(i64 %arg) nounwind uwtable {
%5 = select i1 %4, i64 0, i64 %arg
%6 = add i64 %5, %2
ret i64 %6
-; CHECK: seed64
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
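The full-body CHECK lines added above follow the update_llc_test_checks.py convention noted in the file header. As a rough sketch of how such assertions are regenerated (assuming llc is built and on PATH; exact script options may vary by LLVM revision):

  $ utils/update_llc_test_checks.py test/CodeGen/X86/bool-simplify.ll

The script executes each RUN line and rewrites the ; CHECK blocks from the actual llc output, which is why the hand-written trailing ; CHECK / ; CHECK-NOT patterns are dropped in favor of -NEXT sequences after each CHECK-LABEL.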
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index bbe31c5c2ac58..14bdb3853b031 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1,13 +1,12 @@
-; NOTE: Assertions have been simpilfied MANUALLY after running utils/update_llc_test_checks.py
-; Assertions for constant pools have been added MANUALLY.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64
;===-----------------------------------------------------------------------------===
; This test checks the ability to recognize a cross element pattern of
@@ -17,20 +16,31 @@
; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===
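The splat constants appearing in the regenerated CHECK lines below are these small patterns reinterpreted at a wider element width, little-endian. For example, the repeating byte pattern 0,1 packs into the i16 value 0x0100 = 256; the bytes 0,1,2,3 pack into the i32 value 0x03020100 = 50462976 (printed as the float 3.82047143E-37 when broadcast via vbroadcastss); and the bytes 0..7 pack into the i64 value 0x0706050403020100 = 506097522914230528 (printed as the double 7.9499288951273625E-275).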
-; ALL: LCPI0
-; ALL-NEXT: .short 256 # 0x100
-
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i16:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i16:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
@@ -40,45 +50,48 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
}
-; ALL: .LCPI1
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI1
-; AVX-NEXT .long 50462976 # float 3.82047143E-37
-
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f16xi8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
%res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
ret <16 x i8> %res2
}
-; ALL64: .LCPI2
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; AVX: .LCPI2
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -86,38 +99,56 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f16xi8_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
%res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
ret <16 x i8> %res2
}
-; ALL: .LCPI3
-; ALL-NEXT: .short 256 # 0x100
-
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i16:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i16:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
@@ -127,155 +158,273 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
}
-; ALL: .LCPI4
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI4
-; AVX-NEXT: .long 50462976 # float 3.82047143E-37
-
define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f32xi8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
ret <32 x i8> %res2
}
-; ALL64: .LCPI5
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; AVX: .LCPI5
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i64:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f32xi8_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
ret <32 x i8> %res2
}
-; ALL: .LCPI6
-; ALL-NEXT: .byte 0 # 0x0
-; ALL-NEXT: .byte 1 # 0x1
-; ALL-NEXT: .byte 2 # 0x2
-; ALL-NEXT: .byte 3 # 0x3
-; ALL-NEXT: .byte 4 # 0x4
-; ALL-NEXT: .byte 5 # 0x5
-; ALL-NEXT: .byte 6 # 0x6
-; ALL-NEXT: .byte 7 # 0x7
-; ALL-NEXT: .byte 8 # 0x8
-; ALL-NEXT: .byte 9 # 0x9
-; ALL-NEXT: .byte 10 # 0xa
-; ALL-NEXT: .byte 11 # 0xb
-; ALL-NEXT: .byte 12 # 0xc
-; ALL-NEXT: .byte 13 # 0xd
-; ALL-NEXT: .byte 14 # 0xe
-; ALL-NEXT: .byte 15 # 0xf
-; ALL-NOT: .byte
-
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
-; ALL-LABEL: f32xi8_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f32xi8_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f32xi8_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f32xi8_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f32xi8_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
ret <32 x i8> %res2
}
-; ALL: .LCPI7
-; ALL-NEXT: .short 256 # 0x100
-
define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i16:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i16:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i16:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
ret <64 x i8> %res2
}
-; ALL: .LCPI8
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI8
-; AVX-NEXT: .long 50462976 # float 3.82047143E-37
-
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
+; AVX-LABEL: f64i8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64i8_i32:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f64i8_i32:
+; AVX-64-LABEL: f64i8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64i8_i32:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64i8_i32:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+ ret <64 x i8> %res2
+}
+
+
+define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -283,43 +432,69 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
- %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
- ret <64 x i8> %res2
-}
-
-
-; ALL64: .LCPI9
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; ALL32: .LCPI9
-; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
-; AVX: .LCPI9
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
-define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i64:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f64xi8_i64:
+; AVX-64-LABEL: f64xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i64:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i64:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+ ret <64 x i8> %res2
+}
+
+
+define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -327,143 +502,184 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
- %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
- ret <64 x i8> %res2
-}
-
-
-; ALL: .LCPI10
-; ALL-NEXT: .byte 0 # 0x0
-; ALL-NEXT: .byte 1 # 0x1
-; ALL-NEXT: .byte 2 # 0x2
-; ALL-NEXT: .byte 3 # 0x3
-; ALL-NEXT: .byte 4 # 0x4
-; ALL-NEXT: .byte 5 # 0x5
-; ALL-NEXT: .byte 6 # 0x6
-; ALL-NEXT: .byte 7 # 0x7
-; ALL-NEXT: .byte 8 # 0x8
-; ALL-NEXT: .byte 9 # 0x9
-; ALL-NEXT: .byte 10 # 0xa
-; ALL-NEXT: .byte 11 # 0xb
-; ALL-NEXT: .byte 12 # 0xc
-; ALL-NEXT: .byte 13 # 0xd
-; ALL-NEXT: .byte 14 # 0xe
-; ALL-NEXT: .byte 15 # 0xf
-; ALL-NOT: .byte
-
-define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i128:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i128:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i128:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
ret <64 x i8> %res2
}
-; AVX512BW: .LCPI11
-; AVX512BW-NEXT: .byte 0 # 0x0
-; AVX512BW-NEXT: .byte 1 # 0x1
-; AVX512BW-NEXT: .byte 2 # 0x2
-; AVX512BW-NEXT: .byte 3 # 0x3
-; AVX512BW-NEXT: .byte 4 # 0x4
-; AVX512BW-NEXT: .byte 5 # 0x5
-; AVX512BW-NEXT: .byte 6 # 0x6
-; AVX512BW-NEXT: .byte 7 # 0x7
-; AVX512BW-NEXT: .byte 8 # 0x8
-; AVX512BW-NEXT: .byte 9 # 0x9
-; AVX512BW-NEXT: .byte 10 # 0xa
-; AVX512BW-NEXT: .byte 11 # 0xb
-; AVX512BW-NEXT: .byte 12 # 0xc
-; AVX512BW-NEXT: .byte 13 # 0xd
-; AVX512BW-NEXT: .byte 14 # 0xe
-; AVX512BW-NEXT: .byte 15 # 0xf
-; AVX512BW-NEXT: .byte 16 # 0x10
-; AVX512BW-NEXT: .byte 17 # 0x11
-; AVX512BW-NEXT: .byte 18 # 0x12
-; AVX512BW-NEXT: .byte 19 # 0x13
-; AVX512BW-NEXT: .byte 20 # 0x14
-; AVX512BW-NEXT: .byte 21 # 0x15
-; AVX512BW-NEXT: .byte 22 # 0x16
-; AVX512BW-NEXT: .byte 23 # 0x17
-; AVX512BW-NEXT: .byte 24 # 0x18
-; AVX512BW-NEXT: .byte 25 # 0x19
-; AVX512BW-NEXT: .byte 26 # 0x1a
-; AVX512BW-NEXT: .byte 27 # 0x1b
-; AVX512BW-NEXT: .byte 28 # 0x1c
-; AVX512BW-NEXT: .byte 29 # 0x1d
-; AVX512BW-NEXT: .byte 30 # 0x1e
-; AVX512BW-NEXT: .byte 31 # 0x1f
-; AVX512BW-NOT: .byte
-
define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; NO-AVX512BW-LABEL: f64xi8_i256:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
+;
; AVX512BW-LABEL: f64xi8_i256:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
ret <64 x i8> %res2
}
-; ALL: .LCPI12
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI12
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
%res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
ret <8 x i16> %res2
}
-; ALL64: .LCPI13
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI13
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI13
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -471,67 +687,66 @@ define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
%res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
ret <8 x i16> %res2
}
-; ALL: .LCPI14
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI14
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f16xi16_i32:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
ret <16 x i16> %res2
}
-; ALL64: .LCPI15
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI15
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI15
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f16xi16_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -540,60 +755,154 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
ret <16 x i16> %res2
}
-; ALL: .LCPI16
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f16xi16_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
ret <16 x i16> %res2
}
-; ALL: .LCPI17
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI17
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i32:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f32xi16_i32:
+; AVX-64-LABEL: f32xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i32:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i32:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <32 x i16> %res2
+}
+
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -601,43 +910,69 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL64: .LCPI18
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI18
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI18
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i64:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f32xi16_i64:
+; AVX-64-LABEL: f32xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i64:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i64:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <32 x i16> %res2
+}
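+
+; A 64-bit repeating pattern packs <i16 0,1,2,3> into the quadword
+; 0x0003000200010000 (844433520132096); the 32-bit configs print those bits as
+; the double 4.1720559249406128E-309, the 64-bit configs as the raw integer.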
+
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -645,87 +980,151 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL: .LCPI19
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
-define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i128:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i128:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i128:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
%res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
ret <32 x i16> %res2
}
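+
+; AVX512BW covers the full <32 x i16> with one zmm vbroadcasti32x4/vpaddw/
+; vpandq sequence; without it the vector is processed as two ymm halves.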
-; AVX512BW: .LCPI20
-; AVX512BW-NEXT: .short 0 # 0x0
-; AVX512BW-NEXT: .short 1 # 0x1
-; AVX512BW-NEXT: .short 2 # 0x2
-; AVX512BW-NEXT: .short 3 # 0x3
-; AVX512BW-NEXT: .short 4 # 0x4
-; AVX512BW-NEXT: .short 5 # 0x5
-; AVX512BW-NEXT: .short 6 # 0x6
-; AVX512BW-NEXT: .short 7 # 0x7
-; AVX512BW-NEXT: .short 8 # 0x8
-; AVX512BW-NEXT: .short 9 # 0x9
-; AVX512BW-NEXT: .short 10 # 0xa
-; AVX512BW-NEXT: .short 11 # 0xb
-; AVX512BW-NEXT: .short 12 # 0xc
-; AVX512BW-NEXT: .short 13 # 0xd
-; AVX512BW-NEXT: .short 14 # 0xe
-; AVX512BW-NEXT: .short 15 # 0xf
-; AVX512BW-NOT: .short
-
define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; NO-AVX512BW-LABEL: f32xi16_i256:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
+;
; AVX512BW-LABEL: f32xi16_i256:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
%res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
ret <32 x i16> %res2
}
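+
+; A 256-bit pattern cannot be broadcast into a ymm, so the ymm configs load it
+; with a plain vmovdqa and only the zmm configs still broadcast, via
+; vbroadcasti64x4.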
-; ALL64: .LCPI21
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI21
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI21
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; AVX-LABEL: f4xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f4xi32_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -733,40 +1132,26 @@ define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f4xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xi32_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xi32_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
%res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
ret <4 x i32> %res2
}
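+
+; <i32 0, i32 1> packed into a quadword is 0x100000000 (4294967296), so on
+; 64-bit targets a single vpbroadcastq materializes the whole <4 x i32>
+; constant.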
-; ALL64: .LCPI22
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI22
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI22
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f8xi32_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -775,59 +1160,154 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
%res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
ret <8 x i32> %res2
}
-; ALL: .LCPI23
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f8xi32_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
%res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
ret <8 x i32> %res2
}
-; ALL64: .LCPI24
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI24
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI24
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xi32_i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i64:
; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xi32_i64:
+; AVX-64-LABEL: f16xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
+ %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <16 x i32> %res2
+}
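+
+; For 512-bit vectors the AVX512 configs splat straight into a zmm and use
+; vpandq, while plain AVX falls back to vmovddup of the 64-bit pattern and
+; works on xmm halves.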
+
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -835,51 +1315,103 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
- %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
- ret <16 x i32> %res2
-}
-
-
-; ALL: .LCPI25
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
-define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xi32_i128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f16xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
%res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
ret <16 x i32> %res2
}
-; ALL64: .LCPI26
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; AVX-LABEL: f4xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f4xi64_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f4xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm2
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xi64_i128:
; ALL64: # BB#0:
-; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
@@ -889,15 +1421,62 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
}
-; ALL64: .LCPI27
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm3
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
; AVX2-64-LABEL: f8xi64_i128:
; AVX2-64: # BB#0:
-; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -906,57 +1485,99 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
;
; AVX512F-64-LABEL: f8xi64_i128:
; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i128:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
%res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
%res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
ret <8 x i64> %res2
}
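+
+; The 32-bit target spells the <i64 0, i64 1> pattern as eight i32s,
+; [0,0,1,0,0,0,1,0]; the 64-bit AVX1 path instead builds the i64 1 in
+; registers (movl $1 / vmovq / vpslldq) before the paired vpaddq.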
-; ALL64: .LCPI28
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NEXT: .quad 2 # 0x2
-; ALL64-NEXT: .quad 3 # 0x3
-; ALL64-NOT: .quad
-
define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm4
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xi64_i256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
; AVX512F-64-LABEL: f8xi64_i256:
; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i256:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
%res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
%res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
ret <8 x i64> %res2
}
-; ALL: .LCPI29
-; ALL-NEXT: .quad 4575657222482165760
-
-; AVX: .LCPI29
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; AVX-LABEL: f4xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f4xf32_f64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -964,221 +1585,367 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f4xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xf32_f64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
%res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <4 x float> %res2
}
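+
+; The float pair <2.0, 1.0> has the combined bit pattern 0x3F80000040000000,
+; printed as the integer 4575657222482165760 on 64-bit targets and as the
+; double 0.0078125018626451492 elsewhere.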
-; ALL64: .LCPI30
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI30
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI30
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <8 x float> @f8xf32_f64(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f64:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f8xf32_f64:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <8 x float> %res2
}
-; ALL: .LCPI31
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
define <8 x float> @f8xf32_f128(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f8xf32_f128:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
%res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
ret <8 x float> %res2
}
-; ALL64: .LCPI32
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI32
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI32
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xf32_f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f64:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1
+; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <16 x float> %res2
}
-; ALL: .LCPI33
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xf32_f128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xf32_f128:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
%res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
ret <16 x float> %res2
}
-; AVX512: .LCPI34
-; AVX512-NEXT: .long 1090519040 # float 8
-; AVX512-NEXT: .long 1065353216 # float 1
-; AVX512-NEXT: .long 1073741824 # float 2
-; AVX512-NEXT: .long 1077936128 # float 3
-; AVX512-NEXT: .long 1082130432 # float 4
-; AVX512-NEXT: .long 1084227584 # float 5
-; AVX512-NEXT: .long 1086324736 # float 6
-; AVX512-NEXT: .long 1088421888 # float 7
-; AVX512-NOT: .long
-
define <16 x float> @f16xf32_f256(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f16xf32_f256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
+;
; AVX512-LABEL: f16xf32_f256:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f16xf32_f256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f256:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
%res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
ret <16 x float> %res2
}
-; ALL: .LCPI35
-; ALL-NEXT: .quad 4611686018427387904 # double 2
-; ALL-NEXT: .quad 4607182418800017408 # double 1
-; ALL-NOT: .quad
-
define <4 x double> @f4xf64_f128(<4 x double> %a) {
-; ALL-LABEL: f4xf64_f128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f4xf64_f128:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f4xf64_f128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f4xf64_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f4xf64_f128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
%res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
ret <4 x double> %res2
}
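+
+; For floating-point 128-bit patterns every config agrees on vbroadcastf128 of
+; the <2.0, 1.0> pair, since AVX1 already has 256-bit fp add and divide.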
-; ALL: .LCPI36
-; ALL-NEXT: .quad 4611686018427387904 # double 2
-; ALL-NEXT: .quad 4607182418800017408 # double 1
-; ALL-NOT: .quad
-
define <8 x double> @f8xf64_f128(<8 x double> %a) {
+; AVX-LABEL: f8xf64_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f8xf64_f128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xf64_f128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f8xf64_f128:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f8xf64_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xf64_f128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f8xf64_f128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
%res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
ret <8 x double> %res2
@@ -1193,11 +1960,57 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX512-NOT: .quad
define <8 x double> @f8xf64_f256(<8 x double> %a) {
+; AVX-LABEL: f8xf64_f256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xf64_f256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
+;
; AVX512-LABEL: f8xf64_f256:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xf64_f256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xf64_f256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f8xf64_f256:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
%res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
ret <8 x double> %res2
@@ -1205,32 +2018,34 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
-; ALL: .LCPI38
-; ALL-NEXT: .long 4290379776 # 0xffba0000
-
-; AVX: .LCPI38
-; AVX-NEXT: .long 4290379776 # float NaN
-
define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i32_NaN:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i32_NaN:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i32_NaN:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i32_NaN:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i32_NaN:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
%res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
ret <8 x i16> %res2
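
A note on why the same constant renders two ways in the checks above: 4290379776 is 0xffba0000, and read as an IEEE-754 single that bit pattern has sign 1, an all-ones exponent (0xff), and a non-zero fraction (0x3a0000), which by definition is a NaN. So the integer broadcast prefixes print [4290379776,...] while vbroadcastss renders the identical bits as [NaN,NaN,NaN,NaN]. The constant itself comes from the <i16 0, i16 -70> pairs: -70 is 0xffba, and two adjacent i16 lanes form the i32 0xffba0000 on a little-endian target.
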
diff --git a/test/CodeGen/X86/bswap-wide-int.ll b/test/CodeGen/X86/bswap-wide-int.ll
index db48eb80de4b9..858dbf5fd85fe 100644
--- a/test/CodeGen/X86/bswap-wide-int.ll
+++ b/test/CodeGen/X86/bswap-wide-int.ll
@@ -71,8 +71,8 @@ define i128 @bswap_i128(i128 %a0) nounwind {
; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-MOVBE-NEXT: movbel %esi, 12(%eax)
; X86-MOVBE-NEXT: movbel %edi, 8(%eax)
-; X86-MOVBE-NEXT: movbel %ecx, 4(%eax)
-; X86-MOVBE-NEXT: movbel %edx, (%eax)
+; X86-MOVBE-NEXT: movbel %edx, 4(%eax)
+; X86-MOVBE-NEXT: movbel %ecx, (%eax)
; X86-MOVBE-NEXT: popl %esi
; X86-MOVBE-NEXT: popl %edi
; X86-MOVBE-NEXT: retl $4
diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll
index c73d7654045e4..531c6de5f90cf 100644
--- a/test/CodeGen/X86/build-vector-128.ll
+++ b/test/CodeGen/X86/build-vector-128.ll
@@ -72,12 +72,10 @@ define <4 x float> @test_buildvector_v4f32(float %a0, float %a1, float %a2, floa
}
define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
-; SSE2-32-LABEL: test_buildvector_v2i64:
-; SSE2-32: # BB#0:
-; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-32-NEXT: retl
+; SSE-32-LABEL: test_buildvector_v2i64:
+; SSE-32: # BB#0:
+; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_buildvector_v2i64:
; SSE-64: # BB#0:
@@ -86,20 +84,9 @@ define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
-; SSE41-32-LABEL: test_buildvector_v2i64:
-; SSE41-32: # BB#0:
-; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: retl
-;
; AVX-32-LABEL: test_buildvector_v2i64:
; AVX-32: # BB#0:
-; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v2i64:
diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll
index 1ced1fc3a3822..942b7779abe63 100644
--- a/test/CodeGen/X86/build-vector-256.ll
+++ b/test/CodeGen/X86/build-vector-256.ll
@@ -51,18 +51,10 @@ define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, floa
}
define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
-; AVX1-32-LABEL: test_buildvector_v4i64:
-; AVX1-32: # BB#0:
-; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-32-NEXT: retl
+; AVX-32-LABEL: test_buildvector_v4i64:
+; AVX-32: # BB#0:
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
+; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v4i64:
; AVX1-64: # BB#0:
@@ -75,19 +67,6 @@ define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-64-NEXT: retq
;
-; AVX2-32-LABEL: test_buildvector_v4i64:
-; AVX2-32: # BB#0:
-; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-32-NEXT: retl
-;
; AVX2-64-LABEL: test_buildvector_v4i64:
; AVX2-64: # BB#0:
; AVX2-64-NEXT: vmovq %rcx, %xmm0
diff --git a/test/CodeGen/X86/build-vector-512.ll b/test/CodeGen/X86/build-vector-512.ll
index 21737cca93a10..fbfbf2d53c634 100644
--- a/test/CodeGen/X86/build-vector-512.ll
+++ b/test/CodeGen/X86/build-vector-512.ll
@@ -79,25 +79,7 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, fl
define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {
; AVX-32-LABEL: test_buildvector_v8i64:
; AVX-32: # BB#0:
-; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8i64:
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index 83ab2fac2f167..260535985e2d2 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -148,7 +148,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
; SSE2-NEXT: andnps %xmm5, %xmm0
; SSE2-NEXT: orps %xmm4, %xmm0
; SSE2-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index a6bc5aa321fa5..e2a4368b255a7 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -1063,87 +1063,89 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
;
; AVX1-LABEL: _clearupper32xi8b:
; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r14
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %r8
+; AVX1-NEXT: movq %rcx, %r9
+; AVX1-NEXT: movq %rcx, %r10
+; AVX1-NEXT: movq %rcx, %r11
+; AVX1-NEXT: movq %rcx, %r14
+; AVX1-NEXT: movq %rcx, %r15
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rbx
; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %rbp
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: andb $15, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %r10
-; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movq %rcx, %rdx
; AVX1-NEXT: andb $15, %cl
; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rdx
-; AVX1-NEXT: shrq $40, %rdi
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rax
-; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrq $56, %rbp
+; AVX1-NEXT: andb $15, %bpl
+; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $48, %rsi
; AVX1-NEXT: andb $15, %sil
; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rcx
-; AVX1-NEXT: shrq $24, %r11
-; AVX1-NEXT: andb $15, %r11b
-; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rsi
-; AVX1-NEXT: shrq $16, %r9
-; AVX1-NEXT: andb $15, %r9b
-; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rdi
-; AVX1-NEXT: shrq $8, %r8
-; AVX1-NEXT: andb $15, %r8b
-; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rbx
-; AVX1-NEXT: andb $15, %r14b
-; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $8, %r10
-; AVX1-NEXT: shrq $16, %rdx
-; AVX1-NEXT: shrq $24, %rax
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: shrq $40, %rsi
-; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $56, %rbx
-; AVX1-NEXT: andb $15, %bl
-; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $40, %rdi
; AVX1-NEXT: andb $15, %dil
; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andb $15, %sil
-; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andb $15, %cl
-; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $32, %rax
; AVX1-NEXT: andb $15, %al
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $24, %rbx
+; AVX1-NEXT: andb $15, %bl
+; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $16, %r13
+; AVX1-NEXT: andb $15, %r13b
+; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $8, %r12
+; AVX1-NEXT: andb $15, %r12b
+; AVX1-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $8, %r8
+; AVX1-NEXT: shrq $16, %r9
+; AVX1-NEXT: shrq $24, %r10
+; AVX1-NEXT: shrq $32, %r11
+; AVX1-NEXT: shrq $40, %r14
+; AVX1-NEXT: shrq $48, %r15
+; AVX1-NEXT: shrq $56, %rdx
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r15b
+; AVX1-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r14b
+; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r11b
+; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: andb $15, %r10b
; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r9b
+; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r8b
+; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: shrl $8, %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shrl $24, %ebx
-; AVX1-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $16, %ebx
+; AVX1-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $24, %ebp
+; AVX1-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
; AVX1-NEXT: shrq $32, %rdi
; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX1-NEXT: shrq $40, %rsi
@@ -1153,8 +1155,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: shrq $56, %r8
-; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $8, %ecx
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
@@ -1222,92 +1224,98 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper32xi8b:
; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %r8
+; AVX2-NEXT: movq %rcx, %r9
+; AVX2-NEXT: movq %rcx, %r10
+; AVX2-NEXT: movq %rcx, %r11
+; AVX2-NEXT: movq %rcx, %r14
+; AVX2-NEXT: movq %rcx, %r15
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rbx
; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %rbp
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: andb $15, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: andb $15, %cl
; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rdx
-; AVX2-NEXT: shrq $40, %rdi
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rax
-; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrq $56, %rbp
+; AVX2-NEXT: andb $15, %bpl
+; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $48, %rsi
; AVX2-NEXT: andb $15, %sil
; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rcx
-; AVX2-NEXT: shrq $24, %r11
-; AVX2-NEXT: andb $15, %r11b
-; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rsi
-; AVX2-NEXT: shrq $16, %r9
-; AVX2-NEXT: andb $15, %r9b
-; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rdi
-; AVX2-NEXT: shrq $8, %r8
-; AVX2-NEXT: andb $15, %r8b
-; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rbx
-; AVX2-NEXT: andb $15, %r14b
-; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $8, %r10
-; AVX2-NEXT: shrq $16, %rdx
-; AVX2-NEXT: shrq $24, %rax
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: shrq $40, %rsi
-; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $56, %rbx
-; AVX2-NEXT: andb $15, %bl
-; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $40, %rdi
; AVX2-NEXT: andb $15, %dil
; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: andb $15, %sil
-; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: andb $15, %cl
-; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $32, %rax
; AVX2-NEXT: andb $15, %al
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $24, %rbx
+; AVX2-NEXT: andb $15, %bl
+; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $16, %r13
+; AVX2-NEXT: andb $15, %r13b
+; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $8, %r12
+; AVX2-NEXT: andb $15, %r12b
+; AVX2-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $8, %r8
+; AVX2-NEXT: shrq $16, %r9
+; AVX2-NEXT: shrq $24, %r10
+; AVX2-NEXT: shrq $32, %r11
+; AVX2-NEXT: shrq $40, %r14
+; AVX2-NEXT: shrq $48, %r15
+; AVX2-NEXT: shrq $56, %rdx
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r15b
+; AVX2-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r14b
+; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r11b
+; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: andb $15, %r10b
; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r9b
+; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r8b
+; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shrl $24, %ebx
-; AVX2-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $16, %ebx
+; AVX2-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $24, %ebp
+; AVX2-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
; AVX2-NEXT: shrq $32, %rdi
; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX2-NEXT: shrq $40, %rsi
@@ -1317,8 +1325,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: shrq $56, %r8
-; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX2-NEXT: shrq $56, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %ecx
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
@@ -1386,7 +1394,11 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
%x4 = bitcast <32 x i8> %0 to <64 x i4>
%r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index d901f16e5c73b..fca39bca6c76a 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,34 +1,36 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
-entry:
; CHECK-LABEL: test1:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
-; CHECK-NEXT: cmovael (%rcx), %eax
-; CHECK-NEXT: ret
-
- %0 = lshr i32 %x, %n ; <i32> [#uses=1]
- %1 = and i32 %0, 1 ; <i32> [#uses=1]
- %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: movl $12, %eax
+; CHECK-NEXT: cmovael (%rcx), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = lshr i32 %x, %n
+ %1 = and i32 %0, 1
+ %toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
- %.0 = select i1 %toBool, i32 %v, i32 12 ; <i32> [#uses=1]
+ %.0 = select i1 %toBool, i32 %v, i32 12
ret i32 %.0
}
+
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
-entry:
; CHECK-LABEL: test2:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
-; CHECK-NEXT: cmovbl (%rcx), %eax
-; CHECK-NEXT: ret
-
- %0 = lshr i32 %x, %n ; <i32> [#uses=1]
- %1 = and i32 %0, 1 ; <i32> [#uses=1]
- %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: movl $12, %eax
+; CHECK-NEXT: cmovbl (%rcx), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = lshr i32 %x, %n
+ %1 = and i32 %0, 1
+ %toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
- %.0 = select i1 %toBool, i32 12, i32 %v ; <i32> [#uses=1]
+ %.0 = select i1 %toBool, i32 12, i32 %v
ret i32 %.0
}
@@ -41,10 +43,13 @@ declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: cmov{{n?}}el %[[R1:e..]], %[[R2:e..]]
-; CHECK-NOT: movl
-; CHECK: call
-
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovel %esi, %edi
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
%c = trunc i64 %a to i32
%d = trunc i64 %b to i32
%e = select i1 %p, i32 %c, i32 %d
@@ -65,52 +70,86 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; PR4814
-@g_3 = external global i8 ; <i8*> [#uses=1]
-@g_96 = external global i8 ; <i8*> [#uses=2]
-@g_100 = external global i8 ; <i8*> [#uses=2]
-@_2E_str = external constant [15 x i8], align 1 ; <[15 x i8]*> [#uses=1]
+@g_3 = external global i8
+@g_96 = external global i8
+@g_100 = external global i8
+@_2E_str = external constant [15 x i8], align 1
define i1 @test4() nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movsbl {{.*}}(%rip), %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrb $7, %al
+; CHECK-NEXT: movzbl %al, %ecx
+; CHECK-NEXT: xorl $1, %ecx
+; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; CHECK-NEXT: sarl %cl, %edx
+; CHECK-NEXT: movb {{.*}}(%rip), %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB3_2
+; CHECK-NEXT: # BB#1: # %bb.i.i.i
+; CHECK-NEXT: movb {{.*}}(%rip), %cl
+; CHECK-NEXT: .LBB3_2: # %func_4.exit.i
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: je .LBB3_4
+; CHECK-NEXT: # BB#3: # %func_4.exit.i
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .LBB3_4: # %func_4.exit.i
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB3_7
+; CHECK-NEXT: # BB#5: # %func_4.exit.i
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: jne .LBB3_7
+; CHECK-NEXT: # BB#6: # %bb.i.i
+; CHECK-NEXT: movb {{.*}}(%rip), %cl
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: .LBB3_7: # %func_1.exit
+; CHECK-NEXT: movb %cl, {{.*}}(%rip)
+; CHECK-NEXT: movzbl %cl, %esi
+; CHECK-NEXT: movl $_2E_str, %edi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: callq printf
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
entry:
- %0 = load i8, i8* @g_3, align 1 ; <i8> [#uses=2]
- %1 = sext i8 %0 to i32 ; <i32> [#uses=1]
- %.lobit.i = lshr i8 %0, 7 ; <i8> [#uses=1]
- %tmp.i = zext i8 %.lobit.i to i32 ; <i32> [#uses=1]
- %tmp.not.i = xor i32 %tmp.i, 1 ; <i32> [#uses=1]
- %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i ; <i32> [#uses=1]
- %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8 ; <i8> [#uses=1]
- %2 = icmp eq i8 %retval56.i.i, 0 ; <i1> [#uses=2]
- %g_96.promoted.i = load i8, i8* @g_96 ; <i8> [#uses=3]
- %3 = icmp eq i8 %g_96.promoted.i, 0 ; <i1> [#uses=2]
+ %0 = load i8, i8* @g_3, align 1
+ %1 = sext i8 %0 to i32
+ %.lobit.i = lshr i8 %0, 7
+ %tmp.i = zext i8 %.lobit.i to i32
+ %tmp.not.i = xor i32 %tmp.i, 1
+ %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i
+ %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8
+ %2 = icmp eq i8 %retval56.i.i, 0
+ %g_96.promoted.i = load i8, i8* @g_96
+ %3 = icmp eq i8 %g_96.promoted.i, 0
br i1 %3, label %func_4.exit.i, label %bb.i.i.i
-bb.i.i.i: ; preds = %entry
- %4 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0]
+bb.i.i.i:
+ %4 = load volatile i8, i8* @g_100, align 1
br label %func_4.exit.i
-; CHECK-LABEL: test4:
-; CHECK: g_100
-; CHECK: testb
-; CHECK-NOT: xor
-; CHECK: setne
-; CHECK: testb
-
-func_4.exit.i: ; preds = %bb.i.i.i, %entry
- %.not.i = xor i1 %2, true ; <i1> [#uses=1]
- %brmerge.i = or i1 %3, %.not.i ; <i1> [#uses=1]
- %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0 ; <i8> [#uses=1]
+func_4.exit.i:
+ %.not.i = xor i1 %2, true
+ %brmerge.i = or i1 %3, %.not.i
+ %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0
br i1 %brmerge.i, label %func_1.exit, label %bb.i.i
-bb.i.i: ; preds = %func_4.exit.i
- %5 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0]
+bb.i.i:
+ %5 = load volatile i8, i8* @g_100, align 1
br label %func_1.exit
-func_1.exit: ; preds = %bb.i.i, %func_4.exit.i
- %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] ; <i8> [#uses=2]
+func_1.exit:
+ %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ]
%ret = phi i1 [ 0, %bb.i.i ], [ %.not.i, %func_4.exit.i ]
store i8 %g_96.tmp.0.i, i8* @g_96
- %6 = zext i8 %g_96.tmp.0.i to i32 ; <i32> [#uses=1]
- %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
+ %6 = zext i8 %g_96.tmp.0.i to i32
+ %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind
ret i1 %ret
}
@@ -120,29 +159,32 @@ declare i32 @printf(i8* nocapture, ...) nounwind
; Should compile to setcc | -2.
; rdar://6668608
define i32 @test5(i32* nocapture %P) nounwind readonly {
-entry:
; CHECK-LABEL: test5:
-; CHECK: xorl %eax, %eax
-; CHECK: setg %al
-; CHECK: orl $-2, %eax
-; CHECK: ret
-
- %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
- %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1]
- %iftmp.0.0 = select i1 %1, i32 -1, i32 -2 ; <i32> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $41, (%rdi)
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: orl $-2, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* %P, align 4
+ %1 = icmp sgt i32 %0, 41
+ %iftmp.0.0 = select i1 %1, i32 -1, i32 -2
ret i32 %iftmp.0.0
}
define i32 @test6(i32* nocapture %P) nounwind readonly {
-entry:
; CHECK-LABEL: test6:
-; CHECK: xorl %eax, %eax
-; CHECK: setl %al
-; CHECK: leal 4(%rax,%rax,8), %eax
-; CHECK: ret
- %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
- %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1]
- %iftmp.0.0 = select i1 %1, i32 4, i32 13 ; <i32> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $42, (%rdi)
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: leal 4(%rax,%rax,8), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* %P, align 4
+ %1 = icmp sgt i32 %0, 41
+ %iftmp.0.0 = select i1 %1, i32 4, i32 13
ret i32 %iftmp.0.0
}
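
Two bits of arithmetic make the expected sequences above transparent: in test5, setg yields 1 exactly when the loaded value is greater than 41, and orl $-2 maps that bit to -1 or -2 (since -2 | 1 == -1), which is precisely the select; in test6, setl yields 1 exactly when the value is less than 42, and leal 4(%rax,%rax,8) computes 9*eax + 4, giving 13 when the setcc bit is 1 and 4 when it is 0.
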
@@ -151,16 +193,21 @@ entry:
; because it isn't worth it. Just use a branch instead.
define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
; CHECK-LABEL: test7:
-; CHECK: testb $1, %dil
-; CHECK-NEXT: jne LBB
-
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB6_2
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movb %dl, %sil
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
%d = select i1 %c, i8 %a, i8 %b
ret i8 %d
}
define i32 @smin(i32 %x) {
; CHECK-LABEL: smin:
-; CHECK: ## BB#0:
+; CHECK: # BB#0:
; CHECK-NEXT: xorl $-1, %edi
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: cmovsl %edi, %eax
diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
index d7dc8defac3ae..875d791dc8021 100644
--- a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -37,7 +37,7 @@ end:
ret void
}
-define void @nested_loop_0() !prof !1 {
+define void @nested_loop_0(i1 %flag) !prof !1 {
; Test that a block that is cold in the inner loop but not cold in the outer loop
; will be merged into the outer loop chain.
;
@@ -68,8 +68,7 @@ if.then:
if.else:
call void @e()
- %call2 = call zeroext i1 @a()
- br i1 %call2, label %header2, label %header, !prof !3
+ br i1 %flag, label %header2, label %header, !prof !3
end:
call void @f()
diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll
index 64e081523c1f4..811b1f20833c9 100644
--- a/test/CodeGen/X86/combine-avx-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx-intrinsics.ll
@@ -1,59 +1,56 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7)
ret <4 x double> %1
}
-; CHECK-LABEL: test_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7)
ret <8 x float> %1
}
-; CHECK-LABEL: test_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test2_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
ret <4 x double> %1
}
-; CHECK-LABEL: test2_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test2_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
ret <8 x float> %1
}
-; CHECK-LABEL: test2_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test3_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1)
ret <4 x double> %1
}
-; CHECK-LABEL: test3_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test3_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1)
ret <8 x float> %1
}
-; CHECK-LABEL: test3_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32)
diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll
index 2714b26c91414..9a548f6b7f0eb 100644
--- a/test/CodeGen/X86/combine-avx2-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -1,88 +1,83 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s
; Verify that the backend correctly combines AVX2 builtin intrinsics.
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7)
ret <16 x i16> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7)
ret <4 x i32> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7)
ret <8 x i32> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0)
ret <16 x i16> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0)
ret <4 x i32> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0)
ret <8 x i32> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1)
ret <16 x i16> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1)
ret <4 x i32> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1)
ret <8 x i32> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll
new file mode 100644
index 0000000000000..713ee5d0f65a9
--- /dev/null
+++ b/test/CodeGen/X86/combine-rotates.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512
+
+; fold (rot (rot x, c1), c2) -> rot x, c1+c2
+define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot:
+; XOP: # BB#0:
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
+ %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
+ %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
+
+define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot_splat:
+; XOP: # BB#0:
+; XOP-NEXT: vprotd $7, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot_splat:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $29, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $10, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
+ %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
+
+define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot_splat_zero:
+; XOP: # BB#0:
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot_splat_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
+ %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
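
To make the fold noted at the top of this new file concrete: rotate amounts compose additively modulo the bit width, so the rotl 29 followed by rotl 10 in combine_vec_rot_rot_splat is rotl (29 + 10) mod 32 = 7, exactly the vprotd $7 the XOP check expects. A minimal scalar sketch of the same identity (a hypothetical function, not part of the test):

define i32 @rot_rot_sketch(i32 %x) {
  ; rotl %x, 3 written as a shift pair
  %lo = lshr i32 %x, 29
  %hi = shl i32 %x, 3
  %r1 = or i32 %lo, %hi
  ; rotl %r1, 10 written as a shift pair
  %lo2 = lshr i32 %r1, 22
  %hi2 = shl i32 %r1, 10
  %r2 = or i32 %lo2, %hi2
  ; the combine may fold this to a single rotl %x, (3 + 10) mod 32 = 13
  ret i32 %r2
}
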
diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll
index 1916883c201b6..0c8e7b317ec6f 100644
--- a/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -1,89 +1,81 @@
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s
define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
ret <2 x double> %1
}
-; CHECK-LABEL: test_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: ret
-
define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
ret <4 x float> %1
}
-; CHECK-LABEL: test_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: ret
-
define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
ret <8 x i16> %1
}
-; CHECK-LABEL: test_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: ret
-
define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test2_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
ret <2 x double> %1
}
-; CHECK-LABEL: test2_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test2_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
ret <4 x float> %1
}
-; CHECK-LABEL: test2_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test2_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
ret <8 x i16> %1
}
-; CHECK-LABEL: test2_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
+; CHECK-LABEL: test3_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
ret <2 x double> %1
}
-; CHECK-LABEL: test3_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: ret
-
define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
+; CHECK-LABEL: test3_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
ret <4 x float> %1
}
-; CHECK-LABEL: test3_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: ret
-
define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
+; CHECK-LABEL: test3_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
ret <8 x i16> %1
}
-; CHECK-LABEL: test3_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: ret
-
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll
index 83589b7706f75..d73f7163fd87b 100644
--- a/test/CodeGen/X86/constant-hoisting-bfi.ll
+++ b/test/CodeGen/X86/constant-hoisting-bfi.ll
@@ -4,13 +4,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Check that when BFI is enabled for constant hoisting, the constant 214748364701
; is not hoisted to the function entry.
-; CHECK-LABEL: @foo(
+; CHECK-LABEL: @test1(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: if.then:
; Function Attrs: norecurse nounwind uwtable
-define i64 @foo(i64* nocapture %a) {
+define i64 @test1(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
@@ -52,7 +52,7 @@ return: ; preds = %if.else5, %if.then,
; in while.body will be hoisted to while.body.preheader. 214748364701 in
; if.then16 and if.else10 will be merged and hoisted to the beginning of
; if.else10 because if.else10 dominates if.then16.
-; CHECK-LABEL: @goo(
+; CHECK-LABEL: @test2(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: while.body.preheader:
@@ -61,7 +61,7 @@ return: ; preds = %if.else5, %if.then,
; CHECK: if.else10:
; CHECK-NEXT: bitcast i64 214748364701 to i64
; CHECK-NOT: bitcast i64 214748364701 to i64
-define i64 @goo(i64* nocapture %a) {
+define i64 @test2(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
@@ -113,3 +113,47 @@ return: ; preds = %while.cond.preheade
}
!0 = !{!"branch_weights", i32 1, i32 2000}
+
+; 214748364701 will be hoisted to the entry block to reduce code size.
+; CHECK-LABEL: @test3(
+; CHECK: entry:
+; CHECK-NEXT: %const = bitcast i64 214748364701 to i64
+define i64 @test3(i64 %t0) {
+entry:
+ %cmp = icmp ult i64 %t0, 56
+ br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK-NOT: %const = bitcast i64 214748364701 to i64
+if.then:
+ %add1 = add i64 %t0, 214748364701
+ br label %return
+
+; CHECK: if.else:
+; CHECK-NOT: %const = bitcast i64 214748364701 to i64
+if.else:
+ %add2 = add i64 %t0, 214748364701
+ br label %return
+
+return:
+ %retval = phi i64 [ %add1, %if.then ], [ %add2, %if.else ]
+ ret i64 %retval
+}
+
+; 214748364701 will not be hoisted to the entry block because doing so would
+; only increase its live range.
+; CHECK-LABEL: @test4(
+; CHECK: nextblock:
+; CHECK-NEXT: %add1 = add i64 %t0, 214748364701
+define i64 @test4(i64 %t0) {
+entry:
+ %cmp = icmp ult i64 %t0, 56
+ br label %nextblock
+
+nextblock:
+ %add1 = add i64 %t0, 214748364701
+ br label %return
+
+return:
+ ret i64 %add1
+}
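
For orientation, the shape the test3 CHECK lines expect after constant hoisting is roughly the following; this is a hand-written sketch (only the entry-block bitcast is checked verbatim above), not captured pass output:

define i64 @test3_hoisted_sketch(i64 %t0) {
entry:
  %const = bitcast i64 214748364701 to i64    ; immediate materialized once
  %cmp = icmp ult i64 %t0, 56
  br i1 %cmp, label %if.then, label %if.else
if.then:
  %add1 = add i64 %t0, %const                 ; both arms reuse %const
  br label %return
if.else:
  %add2 = add i64 %t0, %const
  br label %return
return:
  %retval = phi i64 [ %add1, %if.then ], [ %add2, %if.else ]
  ret i64 %retval
}
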
diff --git a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
index 9dd184c8ab316..88778b317b97a 100644
--- a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -62,4 +62,128 @@ define void @test_memcpy_args(i8** %Storage) {
  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4)
  ret void
}
+define i8* @test_memmove1(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove1
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 1)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $1, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_1
+}
+
+define i8* @test_memmove2(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove2
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 2, i32 2)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $2, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_2
+}
+
+define i8* @test_memmove4(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove4
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 4, i32 4)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_4
+}
+
+define i8* @test_memmove8(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove8
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %P, i8* align 8 %Q, i32 8, i32 8)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $8, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_8
+}
+
+define i8* @test_memmove16(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove16
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %P, i8* align 16 %Q, i32 16, i32 16)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $16, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_16
+}
+
+define void @test_memmove_args(i8** %Storage) {
+ ; CHECK: test_memmove_args
+ %Dst = load i8*, i8** %Storage
+ %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+ %Src = load i8*, i8** %Src.addr
+
+ ; 1st arg (%rdi)
+ ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+ ; CHECK-DAG: movq [[REG1]], %rdi
+ ; 2nd arg (%rsi)
+ ; CHECK-DAG: movq 8(%rdi), %rsi
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_4
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4)
+  ret void
+}
+
+define i8* @test_memset1(i8* %P, i8 %V) {
+  ; CHECK: test_memset1
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 1)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $1, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_1
+}
+
+define i8* @test_memset2(i8* %P, i8 %V) {
+ ; CHECK: test_memset2
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 2, i32 2)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $2, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_2
+}
+
+define i8* @test_memset4(i8* %P, i8 %V) {
+ ; CHECK: test_memset4
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 4, i32 4)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_4
+}
+
+define i8* @test_memset8(i8* %P, i8 %V) {
+ ; CHECK: test_memset8
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 %P, i8 %V, i32 8, i32 8)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $8, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_8
+}
+
+define i8* @test_memset16(i8* %P, i8 %V) {
+ ; CHECK: test_memset16
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 16 %P, i8 %V, i32 16, i32 16)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $16, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_16
+}
+
+define void @test_memset_args(i8** %Storage, i8* %V) {
+ ; CHECK: test_memset_args
+ %Dst = load i8*, i8** %Storage
+ %Val = load i8, i8* %V
+
+ ; 1st arg (%rdi)
+ ; CHECK-DAG: movq (%rdi), %rdi
+ ; 2nd arg (%rsi)
+ ; CHECK-DAG: movzbl (%rsi), %esi
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_4
+  call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %Dst, i8 %Val, i32 4, i32 4)
+  ret void
+}
+
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture, i8, i32, i32) nounwind
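
One convention worth spelling out, since the CHECK lines above lean on it: the trailing _N in the __llvm_mem{cpy,move,set}_element_unordered_atomic_N libcall names is the element size in bytes (the intrinsic's final argument), while the byte length travels in %edx as the third SysV argument. A minimal sketch where length and element size differ (hypothetical function name, reusing the memmove declaration above):

define i8* @memmove_len32_elt2(i8* %P, i8* %Q) {
  ; 32 bytes of 2-byte elements lowers to __llvm_memmove_element_unordered_atomic_2
  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 32, i32 2)
  ret i8* %P
}
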
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
index 48cb8d70b9748..4ea6b7801fb31 100644
--- a/test/CodeGen/X86/extract-store.ll
+++ b/test/CodeGen/X86/extract-store.ll
@@ -345,7 +345,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
; SSE-X32-LABEL: extract_i64_1:
; SSE-X32: # BB#0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE-X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-X32-NEXT: movq %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 5d5cbc76f92ee..4d0b5ccc16b0e 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
@@ -6,31 +7,31 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; into loads, off the stack or a previous store.
; Be very explicit about the ordering/stack offsets.
-; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK: # BB#0
-; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movl 16(%esp), %eax
-; CHECK-NEXT: movl 24(%esp), %ecx
-; CHECK-NEXT: movl 20(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: popl %ebx
-; CHECK-NEXT: retl
-
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
+; CHECK-LABEL: test_extractelement_legalization_storereuse:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: paddd (%ecx), %xmm0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movdqa %xmm0, (%ecx)
+; CHECK-NEXT: movl (%ecx), %esi
+; CHECK-NEXT: movl 4(%ecx), %edi
+; CHECK-NEXT: shll $4, %edx
+; CHECK-NEXT: movl 8(%ecx), %ebx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %esi, 12(%eax,%edx)
+; CHECK-NEXT: movl %edi, (%eax,%edx)
+; CHECK-NEXT: movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT: movl %ecx, 4(%eax,%edx)
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = bitcast i32* %y to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 16
diff --git a/test/CodeGen/X86/fast-isel-abort-warm.ll b/test/CodeGen/X86/fast-isel-abort-warm.ll
index 3caa91b11ec69..e87d14bb28ade 100644
--- a/test/CodeGen/X86/fast-isel-abort-warm.ll
+++ b/test/CodeGen/X86/fast-isel-abort-warm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback 2>&1 | FileCheck %s
+; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback -pass-remarks-missed=isel 2>&1 | FileCheck %s
; Make sure FastISel reports a warning when we ask it to do so.
; Note: This test needs to use something that is not supported by FastISel.
; Thus, this test may start failing once FastISel gains support for inline asm.
@@ -6,9 +6,26 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"
+; CHECK: remark: <unknown>:0:0: FastISel missed call: call void asm sideeffect
; CHECK: warning: Instruction selection used fallback path for foo
define void @foo(){
entry:
call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"()
ret void
}
+
+; CHECK: remark: <unknown>:0:0: FastISel missed: store i128
+; CHECK: warning: Instruction selection used fallback path for test_instruction_fallback
+define void @test_instruction_fallback(i128* %ptr){
+ %v1 = load i128, i128* %ptr
+ %result = add i128 %v1, %v1
+ store i128 %result, i128* %ptr
+ ret void
+}
+
+; CHECK-NOT: remark: <unknown>:0:0: FastISel missed
+; CHECK-NOT: warning: Instruction selection used fallback path for test_instruction_not_fallback
+define i32 @test_instruction_not_fallback(i32 %a){
+ %result = add i32 %a, %a
+ ret i32 %result
+}
diff --git a/test/CodeGen/X86/fast-isel-gc-intrinsics.ll b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
new file mode 100644
index 0000000000000..bf08ad01d7d8c
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -fast-isel
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+; Don't crash with GC intrinsics.
+
+; A gc.relocate call should not get an LLVM machine block of its own.
+define i8 addrspace(1)* @test_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7)
+ ret i8 addrspace(1)* %vnew
+}
+
+; gc.result calls are fine in their own blocks.
+define i1 @test_gcresult() gc "statepoint-example" {
+entry:
+ %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0)
+ %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+ ret i1 %call1
+}
+
+; We are okay here because we see the gc.relocate and avoid generating a
+; separate block for it.
+define i1 @test_gcresult_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+ %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7)
+ ret i1 %call1
+}
+
+define i8 addrspace(1)* @test_non_entry_block(i8 addrspace(1)* %v, i8 %val) gc "statepoint-example" {
+entry:
+ %load = load i8, i8 addrspace(1)* %v
+ %cmp = icmp eq i8 %load, %val
+ br i1 %cmp, label %func_call, label %exit
+
+func_call:
+ call void @dummy()
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7)
+ ret i8 addrspace(1)* %vnew
+
+exit:
+ ret i8 addrspace(1)* %v
+
+}
+
+declare void @dummy()
+declare void @foo()
+
+declare zeroext i1 @return_i1()
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
+declare i1 @llvm.experimental.gc.result.i1(token)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
diff --git a/test/CodeGen/X86/fastisel-softfloat.ll b/test/CodeGen/X86/fastisel-softfloat.ll
new file mode 100644
index 0000000000000..e4330db81e1ab
--- /dev/null
+++ b/test/CodeGen/X86/fastisel-softfloat.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define float @pr26522(float %pat) #0 {
+; CHECK-LABEL: pr26522:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ ret float %pat
+}
+
+attributes #0 = { noinline optnone "target-features"="+soft-float" }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 6c6bc8bdc1d13..98082ec611d49 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index c3109673468ec..e09ad3e4e0b85 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -16,10 +16,10 @@
; LIN: sarq $32, %r[[REG2]]
; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
; LIN: sarq $32, %r[[REG4]]
-; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movq %rdi, %xmm1
+; LIN: movq %r[[REG3]], %xmm0
; WIN: movdqa (%rdx), %xmm0
; WIN: pand (%r8), %xmm0
@@ -29,10 +29,10 @@
; WIN: sarq $32, %r[[REG2]]
; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
; WIN: sarq $32, %r[[REG4]]
-; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movdqa (%r[[REG2]]), %xmm0
+; WIN: movq %r[[REG2]], %xmm1
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
%a = load <4 x i32>, <4 x i32>* %i
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index 4c8003f0c516e..b7c43d3b2e3ec 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -1,266 +1,833 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN: | FileCheck %s -check-prefix=CHECK-I686
-
-define void @test_load_store(half* %in, half* %out) {
-; CHECK-LABEL: test_load_store:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
-; CHECK: movw %ax, (%rsi)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,BWON,BWON-F16C
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2 -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefixes=CHECK-I686
+
+define void @test_load_store(half* %in, half* %out) #0 {
+; BWON-LABEL: test_load_store:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: movw %ax, (%rsi)
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_load_store:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: movw %ax, (%rsi)
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_load_store:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw (%ecx), %cx
+; CHECK-I686-NEXT: movw %cx, (%eax)
+; CHECK-I686-NEXT: retl
%val = load half, half* %in
store half %val, half* %out
ret void
}
-define i16 @test_bitcast_from_half(half* %addr) {
-; CHECK-LABEL: test_bitcast_from_half:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
+define i16 @test_bitcast_from_half(half* %addr) #0 {
+; BWON-LABEL: test_bitcast_from_half:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_bitcast_from_half:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_from_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movw (%eax), %ax
+; CHECK-I686-NEXT: retl
%val = load half, half* %addr
%val_int = bitcast half %val to i16
ret i16 %val_int
}
-define void @test_bitcast_to_half(half* %addr, i16 %in) {
+define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
; CHECK-LABEL: test_bitcast_to_half:
-; CHECK: movw %si, (%rdi)
+; CHECK: # BB#0:
+; CHECK-NEXT: movw %si, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_to_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw %ax, (%ecx)
+; CHECK-I686-NEXT: retl
%val_fp = bitcast i16 %in to half
store half %val_fp, half* %addr
ret void
}
-define float @test_extend32(half* %addr) {
-; CHECK-LABEL: test_extend32:
-
-; CHECK-LIBCALL: jmp __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
+define float @test_extend32(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL
+;
+; BWON-F16C-LABEL: test_extend32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to float
ret float %val32
}
-define double @test_extend64(half* %addr) {
-; CHECK-LABEL: test_extend64:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtss2sd
+define double @test_extend64(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to double
ret double %val32
}
-define void @test_trunc32(float %in, half* %addr) {
-; CHECK-LABEL: test_trunc32:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
+define void @test_trunc32(float %in, half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_trunc32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rdi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc float %in to half
store half %val16, half* %addr
ret void
}
-define void @test_trunc64(double %in, half* %addr) {
+define void @test_trunc64(double %in, half* %addr) #0 {
; CHECK-LABEL: test_trunc64:
-
-; CHECK-LIBCALL: callq __truncdfhf2
-; CHECK-F16C: callq __truncdfhf2
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: callq __truncdfhf2
+; CHECK-NEXT: movw %ax, (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movsd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc double %in to half
store half %val16, half* %addr
ret void
}
define i64 @test_fptosi_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptosi_i64:
-
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptosi_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_fptosi_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptosi_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixsfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptosi half %a to i64
ret i64 %r
}
define void @test_sitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_sitofp_i64:
-
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK_LIBCALL-NEXT: popq [[ADDR]]
-; CHECK_LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_sitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_sitofp_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = sitofp i64 %a to half
store half %r, half* %p
ret void
}
define i64 @test_fptoui_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptoui_i64:
-
-; FP_TO_UINT is expanded using FP_TO_SINT
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss {{.[A-Z_0-9]+}}(%rip), [[REG1:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movaps %xmm0, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: subss [[REG1]], [[REG2]]
-; CHECK-LIBCALL-NEXT: cvttss2si [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, [[REG4:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: xorq [[REG3]], [[REG4]]
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, [[REG5:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: ucomiss [[REG1]], %xmm0
-; CHECK-LIBCALL-NEXT: cmovaeq [[REG4]], [[REG5]]
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovss {{.[A-Z_0-9]+}}(%rip), [[REG3:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vsubss [[REG3]], [[REG2]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG4]], [[REG5:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: movabsq $-9223372036854775808, [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: xorq [[REG5]], [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], [[REG7:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vucomiss [[REG3]], [[REG2]]
-; CHECK-F16C-NEXT: cmovaeq [[REG6]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptoui_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2
+; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rax
+; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-LIBCALL-NEXT: xorq %rax, %rcx
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT: cmovaeq %rcx, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_fptoui_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; BWON-F16C-NEXT: vcvttss2si %xmm2, %rax
+; BWON-F16C-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; BWON-F16C-NEXT: xorq %rax, %rcx
+; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax
+; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0
+; BWON-F16C-NEXT: cmovaeq %rcx, %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptoui_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixunssfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptoui half %a to i64
ret i64 %r
}
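
The CHECK-LIBCALL/BWON-F16C sequences above (subss, two cvttss2si, xorq,
cmovaeq) are the standard expansion of FP_TO_UINT via FP_TO_SINT that the
removed comment mentioned. A hand-written IR sketch of the same idea
(hypothetical function; float source, with 2^63 spelled as a hex literal):

define i64 @f32_to_u64_expanded(float %x) {
  %small = fptosi float %x to i64                 ; correct for 0 <= x < 2^63
  %adj = fsub float %x, 0x43E0000000000000        ; x - 2^63
  %big0 = fptosi float %adj to i64
  %big = xor i64 %big0, -9223372036854775808      ; put the top bit back
  %ge = fcmp oge float %x, 0x43E0000000000000
  %r = select i1 %ge, i64 %big, i64 %small
  ret i64 %r
}
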
define void @test_uitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_uitofp_i64:
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]]
-
-; simple conversion to float if non-negative
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]]
-; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]]
-
-; convert using shift+or if negative
-; CHECK-NEXT: [[LABEL1]]:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]]
-; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]]
-; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]]
-
-; convert float to half
-; CHECK-NEXT: [[LABEL2]]:
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-NEXT: retq
-
+; CHECK-LIBCALL-LABEL: test_uitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT: js .LBB10_1
+; CHECK-LIBCALL-NEXT: # BB#2:
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: jmp .LBB10_3
+; CHECK-LIBCALL-NEXT: .LBB10_1:
+; CHECK-LIBCALL-NEXT: movq %rdi, %rax
+; CHECK-LIBCALL-NEXT: shrq %rax
+; CHECK-LIBCALL-NEXT: andl $1, %edi
+; CHECK-LIBCALL-NEXT: orq %rax, %rdi
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: .LBB10_3:
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_uitofp_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: testq %rdi, %rdi
+; BWON-F16C-NEXT: js .LBB10_1
+; BWON-F16C-NEXT: # BB#2:
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: jmp .LBB10_3
+; BWON-F16C-NEXT: .LBB10_1:
+; BWON-F16C-NEXT: movq %rdi, %rax
+; BWON-F16C-NEXT: shrq %rax
+; BWON-F16C-NEXT: andl $1, %edi
+; BWON-F16C-NEXT: orq %rax, %rdi
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: .LBB10_3:
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_uitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: xorl %eax, %eax
+; CHECK-I686-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: setns %al
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = uitofp i64 %a to half
store half %r, half* %p
ret void
}
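
The comments deleted above described the uint64-to-float expansion that the
new CHECK lines still exercise: convert directly when the sign bit is clear;
otherwise halve the value with the low bit ORed back in (so the final rounding
is unaffected), convert, and double. As a standalone sketch (hypothetical
function):

define float @u64_to_f32_expanded(i64 %x) {
  %neg = icmp slt i64 %x, 0
  br i1 %neg, label %big, label %small

small:                                ; sign bit clear: signed convert is exact
  %lo = sitofp i64 %x to float
  ret float %lo

big:                                  ; sign bit set: shift+or, convert, double
  %half = lshr i64 %x, 1
  %bit = and i64 %x, 1
  %odd = or i64 %half, %bit           ; keep a sticky low bit for rounding
  %f = sitofp i64 %odd to float
  %r = fadd float %f, %f              ; compensate for the halving
  ret float %r
}
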
define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
+; CHECK-LIBCALL-LABEL: test_extend32_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $48, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: addq $48, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend32_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl 6(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: movswl 2(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $56, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-I686-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT: addl $56, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x float>
ret <4 x float> %b
}
define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend64_vec4
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C: vcvtss2sd
+; CHECK-LIBCALL-LABEL: test_extend64_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend64_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl 2(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: movswl 6(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $88, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-I686-NEXT: addl $88, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x double>
ret <4 x double> %b
}
-define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
+; BWON-NOF16C-LABEL: test_trunc32_vec4:
+; BWON-NOF16C: # BB#0:
+; BWON-NOF16C-NEXT: pushq %rbp
+; BWON-NOF16C-NEXT: pushq %r15
+; BWON-NOF16C-NEXT: pushq %r14
+; BWON-NOF16C-NEXT: pushq %rbx
+; BWON-NOF16C-NEXT: subq $24, %rsp
+; BWON-NOF16C-NEXT: movq %rdi, %rbx
+; BWON-NOF16C-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %r14d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %r15d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %ebp
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movw %ax, (%rbx)
+; BWON-NOF16C-NEXT: movw %bp, 6(%rbx)
+; BWON-NOF16C-NEXT: movw %r15w, 4(%rbx)
+; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT: addq $24, %rsp
+; BWON-NOF16C-NEXT: popq %rbx
+; BWON-NOF16C-NEXT: popq %r14
+; BWON-NOF16C-NEXT: popq %r15
+; BWON-NOF16C-NEXT: popq %rbp
+; BWON-NOF16C-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc32_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $24, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, (%rbx)
+; BWOFF-NEXT: movw %bp, 6(%rbx)
+; BWOFF-NEXT: movw %r15w, 4(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $24, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc32_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %eax
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %ecx
+; BWON-F16C-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %edx
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %esi
+; BWON-F16C-NEXT: movw %si, (%rdi)
+; BWON-F16C-NEXT: movw %dx, 6(%rdi)
+; BWON-F16C-NEXT: movw %cx, 4(%rdi)
+; BWON-F16C-NEXT: movw %ax, 2(%rdi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $44, %esp
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movaps %xmm0, %xmm1
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-I686-NEXT: movss %xmm1, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%ebp)
+; CHECK-I686-NEXT: movw %bx, 6(%ebp)
+; CHECK-I686-NEXT: movw %di, 4(%ebp)
+; CHECK-I686-NEXT: movw %si, 2(%ebp)
+; CHECK-I686-NEXT: addl $44, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x float> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
}
-define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc64_vec4:
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
+; BWON-NOF16C-LABEL: test_trunc64_vec4:
+; BWON-NOF16C: # BB#0:
+; BWON-NOF16C-NEXT: pushq %rbp
+; BWON-NOF16C-NEXT: pushq %r15
+; BWON-NOF16C-NEXT: pushq %r14
+; BWON-NOF16C-NEXT: pushq %rbx
+; BWON-NOF16C-NEXT: subq $40, %rsp
+; BWON-NOF16C-NEXT: movq %rdi, %rbx
+; BWON-NOF16C-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %r14d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %r15d
+; BWON-NOF16C-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %ebp
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movw %ax, 4(%rbx)
+; BWON-NOF16C-NEXT: movw %bp, (%rbx)
+; BWON-NOF16C-NEXT: movw %r15w, 6(%rbx)
+; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT: addq $40, %rsp
+; BWON-NOF16C-NEXT: popq %rbx
+; BWON-NOF16C-NEXT: popq %r14
+; BWON-NOF16C-NEXT: popq %r15
+; BWON-NOF16C-NEXT: popq %rbp
+; BWON-NOF16C-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc64_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $40, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, 4(%rbx)
+; BWOFF-NEXT: movw %bp, (%rbx)
+; BWOFF-NEXT: movw %r15w, 6(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $40, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc64_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: pushq %rbp
+; BWON-F16C-NEXT: pushq %r15
+; BWON-F16C-NEXT: pushq %r14
+; BWON-F16C-NEXT: pushq %rbx
+; BWON-F16C-NEXT: subq $88, %rsp
+; BWON-F16C-NEXT: movq %rdi, %rbx
+; BWON-F16C-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %r14d
+; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
+; BWON-F16C-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %r15d
+; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %ebp
+; BWON-F16C-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movw %ax, 4(%rbx)
+; BWON-F16C-NEXT: movw %bp, (%rbx)
+; BWON-F16C-NEXT: movw %r15w, 6(%rbx)
+; BWON-F16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-F16C-NEXT: addq $88, %rsp
+; BWON-F16C-NEXT: popq %rbx
+; BWON-F16C-NEXT: popq %r14
+; BWON-F16C-NEXT: popq %r15
+; BWON-F16C-NEXT: popq %rbp
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $60, %esp
+; CHECK-I686-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, 6(%ebp)
+; CHECK-I686-NEXT: movw %bx, 4(%ebp)
+; CHECK-I686-NEXT: movw %di, 2(%ebp)
+; CHECK-I686-NEXT: movw %si, (%ebp)
+; CHECK-I686-NEXT: addl $60, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x double> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
@@ -272,40 +839,98 @@ declare float @test_floatret();
; to f80 and then rounded to f32. The DAG combiner should not combine this
; fp_round and the subsequent fptrunc from float to half.
define half @test_f80trunc_nodagcombine() #0 {
-; CHECK-LABEL: test_f80trunc_nodagcombine:
-; CHECK-I686-NOT: calll __truncxfhf2
+; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: callq test_floatret
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_f80trunc_nodagcombine:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: pushq %rax
+; BWON-F16C-NEXT: callq test_floatret
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: popq %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: calll test_floatret
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%1 = call float @test_floatret()
%2 = fptrunc float %1 to half
ret half %2
}
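
The hazard this test pins down, in isolation: the float returned by
test_floatret materializes at x87 80-bit precision, and collapsing the two
successive roundings (f80 to f32, then f32 to f16) into a single f80-to-f16
round removes a rounding step and can change the result. The pattern the
combiner must leave alone (hypothetical function):

define half @two_roundings(x86_fp80 %v) {
  %f = fptrunc x86_fp80 %v to float   ; first rounding step
  %h = fptrunc float %f to half       ; second rounding step, kept separate
  ret half %h
}
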
-; CHECK-LABEL: test_sitofp_fadd_i32:
-; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movl %edi, %ebx
-; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
-; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: addq $16, %rsp
-; CHECK-LIBCALL-NEXT: popq %rbx
-; CHECK-LIBCALL-NEXT: retq
-; CHECK-F16C-NEXT: movswl (%rsi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
-; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-F16C-NEXT: retq
define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movl %edi, %ebx
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_sitofp_fadd_i32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rsi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_fadd_i32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $28, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-I686-NEXT: xorps %xmm0, %xmm0
+; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload
+; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: addl $28, %esp
+; CHECK-I686-NEXT: retl
%tmp0 = load half, half* %b
%tmp1 = sitofp i32 %a to half
%tmp2 = fadd half %tmp0, %tmp1
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index ceb4657119065..5425670fbb1ed 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -1,17 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
define void @i24_or(i24* %a) {
-; CHECK-LABEL: i24_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl $384, %eax # imm = 0x180
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzbl 2(%rdi), %ecx
+; X64-NEXT: movb %cl, 2(%rdi)
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: orl $384, %ecx # imm = 0x180
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%aa = load i24, i24* %a, align 1
%b = or i24 %aa, 384
store i24 %b, i24* %a, align 1
@@ -19,17 +32,30 @@ define void @i24_or(i24* %a) {
}
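
For readers puzzled by the movzwl/movzbl pairs in the CHECK lines: the illegal
i24 access is legalized into natural-width pieces, a 16-bit low part and an
8-bit high part, which are reassembled, modified, and stored back separately.
A hand-written equivalent of the memory traffic (hypothetical function, not
compiler output):

define void @i24_or_by_parts(i8* %p) {
  %p16 = bitcast i8* %p to i16*
  %lo = load i16, i16* %p16, align 1   ; bytes 0..1
  %p2 = getelementptr i8, i8* %p, i32 2
  %hi = load i8, i8* %p2               ; byte 2
  store i8 %hi, i8* %p2                ; high byte written back unchanged
  %lo.or = or i16 %lo, 384             ; only the low half is affected here
  store i16 %lo.or, i16* %p16, align 1
  ret void
}
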
define void @i24_and_or(i24* %a) {
-; CHECK-LABEL: i24_and_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_and_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl $384, %eax # imm = 0x180
+; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_and_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzbl 2(%rdi), %ecx
+; X64-NEXT: movb %cl, 2(%rdi)
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: orl $384, %ecx # imm = 0x180
+; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%b = load i24, i24* %a, align 1
%c = and i24 %b, -128
%d = or i24 %c, 384
@@ -38,19 +64,40 @@ define void @i24_and_or(i24* %a) {
}
define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
-; CHECK-LABEL: i24_insert_bit:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl (%rdi), %ecx
-; CHECK-NEXT: movzbl 2(%rdi), %edx
-; CHECK-NEXT: movb %dl, 2(%rdi)
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shll $13, %eax
-; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
-; CHECK-NEXT: orl %eax, %edx
-; CHECK-NEXT: movw %dx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_insert_bit:
+; X86: # BB#0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .Lcfi0:
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .Lcfi1:
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl (%ecx), %esi
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: shll $13, %edx
+; X86-NEXT: andl $16769023, %eax # imm = 0xFFDFFF
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_insert_bit:
+; X64: # BB#0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzwl (%rdi), %ecx
+; X64-NEXT: movzbl 2(%rdi), %edx
+; X64-NEXT: movb %dl, 2(%rdi)
+; X64-NEXT: shll $16, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: shll $13, %eax
+; X64-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
+; X64-NEXT: orl %eax, %edx
+; X64-NEXT: movw %dx, (%rdi)
+; X64-NEXT: retq
%extbit = zext i1 %bit to i24
%b = load i24, i24* %a, align 1
%extbit.shl = shl nuw nsw i24 %extbit, 13
@@ -61,22 +108,28 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
}
define void @i56_or(i56* %a) {
-; CHECK-LABEL: i56_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movl %edx, (%rdi)
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: movw %dx, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl $384, (%eax) # imm = 0x180
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: movl (%rdi), %edx
+; X64-NEXT: movb %cl, 6(%rdi)
+; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: orq $384, %rdx # imm = 0x180
+; X64-NEXT: movl %edx, (%rdi)
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movw %dx, 4(%rdi)
+; X64-NEXT: retq
%aa = load i56, i56* %a, align 1
%b = or i56 %aa, 384
store i56 %b, i56* %a, align 1
@@ -84,24 +137,33 @@ define void @i56_or(i56* %a) {
}
define void @i56_and_or(i56* %a) {
-; CHECK-LABEL: i56_and_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_and_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $384, %ecx # imm = 0x180
+; X86-NEXT: orl (%eax), %ecx
+; X86-NEXT: andl $-128, %ecx
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_and_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: movl (%rdi), %edx
+; X64-NEXT: movb %cl, 6(%rdi)
+; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: orq $384, %rdx # imm = 0x180
+; X64-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movw %ax, 4(%rdi)
+; X64-NEXT: retq
%b = load i56, i56* %a, align 1
%c = and i56 %b, -128
%d = or i56 %c, 384
@@ -110,26 +172,37 @@ define void @i56_and_or(i56* %a) {
}
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
-; CHECK-LABEL: i56_insert_bit:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: movl (%rdi), %esi
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: orq %rdx, %rsi
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movw %cx, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_insert_bit:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $13, %ecx
+; X86-NEXT: movl $-8193, %edx # imm = 0xDFFF
+; X86-NEXT: andl (%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_insert_bit:
+; X64: # BB#0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzwl 4(%rdi), %ecx
+; X64-NEXT: movzbl 6(%rdi), %edx
+; X64-NEXT: movl (%rdi), %esi
+; X64-NEXT: movb %dl, 6(%rdi)
+; X64-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
+; X64-NEXT: shll $16, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: shlq $32, %rdx
+; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: shlq $13, %rax
+; X64-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
+; X64-NEXT: andq %rsi, %rcx
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: movw %cx, 4(%rdi)
+; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
%extbit.shl = shl nuw nsw i56 %extbit, 13
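The point of the new split checks in this file shows in the hunks above: on X86 the bitfield store is narrowed to a single read-modify-write of the low dword, while X64 still reassembles the full illegal-width value. The hunk context cuts off before the rest of @i56_insert_bit; the tail follows the clear-then-set pattern implied by the X64 sequence (andq with 0xFFFFFFFFFFDFFF, then orq). A minimal sketch of the remainder, with value names assumed rather than taken from the source file:

  %extbit.clr = and i56 %b, -8193                ; clear bit 13 (mask 0xFFFFFFFFFFDFFF)
  %extbit.set = or i56 %extbit.clr, %extbit.shl  ; insert the zero-extended bit
  store i56 %extbit.set, i56* %a, align 1
  ret void
}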
diff --git a/test/CodeGen/X86/optimize-max-1.ll b/test/CodeGen/X86/optimize-max-1.ll
index 11e2f9a93a57f..08cb86ab39896 100644
--- a/test/CodeGen/X86/optimize-max-1.ll
+++ b/test/CodeGen/X86/optimize-max-1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 | not grep cmov
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; LSR should be able to eliminate both smax and umax expressions
; in loop trip counts.
@@ -6,6 +7,18 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @fs(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: fs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jl .LBB0_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp slt i64 %n, 1 ; <i1> [#uses=1]
%smax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1]
@@ -24,6 +37,18 @@ return: ; preds = %bb
}
define void @bs(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: bs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jl .LBB1_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp sge i64 %n, 1 ; <i1> [#uses=1]
%smax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1]
@@ -42,6 +67,18 @@ return: ; preds = %bb
}
define void @fu(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: fu:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jb .LBB2_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %n, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1]
@@ -60,6 +97,18 @@ return: ; preds = %bb
}
define void @bu(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: bu:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB3_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jb .LBB3_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp ne i64 %n, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1]
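Only the entry blocks of these four functions fit in the hunks; each loop body is the canonical store loop visible in the generated code (movq $0/incq/cmpq). A sketch of the shape LSR rewrites in @fs, with block and value names assumed, where %smax is the select computed in the entry block:

bb:
  %i = phi i64 [ 0, %entry ], [ %i.next, %bb ]
  %p.i = getelementptr double, double* %p, i64 %i
  store double 0.000000e+00, double* %p.i, align 8
  %i.next = add i64 %i, 1
  %exitcond = icmp slt i64 %i.next, %smax
  br i1 %exitcond, label %bb, label %return

The update script replaces the old "not grep cmov" with full-body checks, which also pin down that LSR folded the smax/umax into the latch compare against %rsi.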
diff --git a/test/CodeGen/X86/optimize-max-2.ll b/test/CodeGen/X86/optimize-max-2.ll
index 45b542e2267c1..37d2a20975a04 100644
--- a/test/CodeGen/X86/optimize-max-2.ll
+++ b/test/CodeGen/X86/optimize-max-2.ll
@@ -1,8 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep cmov | count 2
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-; CHECK: jne
-; CHECK-NOT: jne
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; LSR's OptimizeMax function shouldn't try to eliminate this max, because
; it has three operands.
@@ -10,6 +7,24 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @foo(double* nocapture %p, i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovneq %rdx, %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: cmovbeq %rsi, %rax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %bb4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: movsd %xmm0, (%rdi)
+; CHECK-NEXT: addq $8, %rdi
+; CHECK-NEXT: decq %rax
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %y, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 1, i64 %y ; <i64> [#uses=2]
@@ -30,3 +45,4 @@ bb4: ; preds = %bb4, %entry
return: ; preds = %bb4
ret void
}
+
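The max that OptimizeMax must leave alone here is a max of three values, 1, %y, and %x, built from two selects; the cmovneq/cmovbeq pair in the new CHECK lines is its direct lowering. A sketch of the pattern, extending the two lines shown above with an assumed second select:

  %tmp = icmp eq i64 %y, 0
  %umax = select i1 %tmp, i64 1, i64 %y          ; umax(%y, 1)
  %over = icmp ugt i64 %umax, %x
  %umax3 = select i1 %over, i64 %umax, i64 %x    ; umax(%x, %y, 1)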
diff --git a/test/CodeGen/X86/pr15309.ll b/test/CodeGen/X86/pr15309.ll
index e9d9b9e54c137..0301b58def1c3 100644
--- a/test/CodeGen/X86/pr15309.ll
+++ b/test/CodeGen/X86/pr15309.ll
@@ -1,15 +1,43 @@
-; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux-pc | FileCheck %s
-define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) noinline {
-L.entry:
- %0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10
- %1 = load <2 x i64>, <2 x i64>* %0, align 16
- %2 = uitofp <2 x i64> %1 to <2 x float>
- %3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10
- store <2 x float> %2, <2 x float>* %3, align 8
+define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) nounwind {
+; CHECK-LABEL: test_convert_float2_ulong2:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl 168(%ecx), %edx
+; CHECK-NEXT: movl 172(%ecx), %esi
+; CHECK-NEXT: movl 160(%ecx), %edi
+; CHECK-NEXT: movl 164(%ecx), %ecx
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, (%esp)
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setns %dl
+; CHECK-NEXT: fildll (%esp)
+; CHECK-NEXT: fadds {{\.LCPI.*}}(,%edx,4)
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setns %cl
+; CHECK-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
+; CHECK-NEXT: fstps 84(%eax)
+; CHECK-NEXT: fstps 80(%eax)
+; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
+ %t0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10
+ %t1 = load <2 x i64>, <2 x i64>* %t0, align 16
+ %t2 = uitofp <2 x i64> %t1 to <2 x float>
+ %t3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10
+ store <2 x float> %t2, <2 x float>* %t3, align 8
ret void
}
-; CHECK: test_convert_float2_ulong2
-; CHECK-NOT: cvtpd2ps
-; CHECK: ret
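What the autogenerated CHECK lines encode: i686 has no unsigned 64-bit to float conversion, so each lane goes through the signed x87 fildll and is then corrected by adding 2^64 whenever the value is negative when reinterpreted as signed (the setns/fadds pair indexing a two-entry constant-pool table), using the identity uitofp(x) = sitofp(x) + (x signed-negative ? 2^64 : 0.0). The old checks only ruled out cvtpd2ps; the new ones pin the whole sequence.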
diff --git a/test/CodeGen/X86/pr23603.ll b/test/CodeGen/X86/pr23603.ll
index 6f856aedb8d58..315e60768613a 100644
--- a/test/CodeGen/X86/pr23603.ll
+++ b/test/CodeGen/X86/pr23603.ll
@@ -1,14 +1,29 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
declare void @free_v()
-define void @f(i32* %x, i32 %c32, i32* %y) {
-; CHECK-LABEL: f
+define void @f(i32* %x, i32 %c32, i32* %y) nounwind {
+; CHECK-LABEL: f:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: movl (%rdi), %ebx
+; CHECK-NEXT: callq free_v
+; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # BB#1: # %left
+; CHECK-NEXT: movl %ebx, (%r14)
+; CHECK-NEXT: .LBB0_2: # %merge
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
entry:
%v = load i32, i32* %x, !invariant.load !0
-; CHECK: movl (%rdi), %ebx
-; CHECK: free_v
-; CHECK-NOT: movl (%rdi), %ebx
call void @free_v()
%c = icmp ne i32 %c32, 0
br i1 %c, label %left, label %merge
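The !invariant.load metadata on the load (an empty node, declared as !0 = !{} in the full file) is what lets the value from %x stay in the callee-saved %ebx across the call to @free_v: it asserts the location never changes during the program's lifetime, so no reload is needed in %left. The old CHECK-NOT for the reload is dropped because the autogenerated body now pins the exact sequence.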
diff --git a/test/CodeGen/X86/pr33715.ll b/test/CodeGen/X86/pr33715.ll
new file mode 100644
index 0000000000000..15432cfdb512c
--- /dev/null
+++ b/test/CodeGen/X86/pr33715.ll
@@ -0,0 +1,16 @@
+; Make sure we don't crash with a build vector of integer constants.
+; RUN: llc %s -o /dev/null
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @patatino() {
+ %tmp = insertelement <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>, i32 1, i32 2
+ %tmp1 = insertelement <4 x i32> %tmp, i32 1, i32 3
+ %tmp2 = icmp ne <4 x i32> %tmp1, zeroinitializer
+ %tmp3 = icmp slt <4 x i32> %tmp1, <i32 4, i32 4, i32 4, i32 4>
+ %tmp4 = or <4 x i1> %tmp2, %tmp3
+ %tmp5 = select <4 x i1> %tmp4, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+ %tmp6 = extractelement <4 x i32> %tmp5, i32 0
+ ret i32 %tmp6
+}
diff --git a/test/CodeGen/X86/rdrand-x86_64.ll b/test/CodeGen/X86/rdrand-x86_64.ll
new file mode 100644
index 0000000000000..06f1136087bbd
--- /dev/null
+++ b/test/CodeGen/X86/rdrand-x86_64.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
+
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i32 @_rdrand64_step(i64* %random_val) {
+; CHECK-LABEL: _rdrand64_step:
+; CHECK: # BB#0:
+; CHECK-NEXT: rdrandq %rcx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovael %ecx, %eax
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %call = call {i64, i32} @llvm.x86.rdrand.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+}
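The 64-bit step intrinsic moves to this new file because it only selects on x86_64 (the new 32-bit RUN line in rdrand.ll below has no rdrandq). The {i64, i32} pair is normally consumed by branching on the second element, since rdrand can fail and reports that through the carry flag; a minimal retry loop as a sketch, not taken from the test suite:

declare { i64, i32 } @llvm.x86.rdrand.64()

define i64 @rdrand64_retry() {
entry:
  br label %loop

loop:                                        ; retry until the validity flag is set
  %call = call { i64, i32 } @llvm.x86.rdrand.64()
  %val = extractvalue { i64, i32 } %call, 0
  %ok = extractvalue { i64, i32 } %call, 1
  %valid = icmp ne i32 %ok, 0
  br i1 %valid, label %done, label %loop

done:
  ret i64 %val
}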
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index 107cde05a0e6f..0638e00952822 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -1,66 +1,117 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X64
+
declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
-declare {i64, i32} @llvm.x86.rdrand.64()
define i32 @_rdrand16_step(i16* %random_val) {
+; X86-LABEL: _rdrand16_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdrandw %ax
+; X86-NEXT: movzwl %ax, %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movw %dx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdrand16_step:
+; X64: # BB#0:
+; X64-NEXT: rdrandw %ax
+; X64-NEXT: movzwl %ax, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%call = call {i16, i32} @llvm.x86.rdrand.16()
%randval = extractvalue {i16, i32} %call, 0
store i16 %randval, i16* %random_val
%isvalid = extractvalue {i16, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdrand16_step:
-; CHECK: rdrandw %ax
-; CHECK: movzwl %ax, %ecx
-; CHECK: movl $1, %eax
-; CHECK: cmovael %ecx, %eax
-; CHECK: movw %cx, (%r[[A0:di|cx]])
-; CHECK: ret
}
define i32 @_rdrand32_step(i32* %random_val) {
+; X86-LABEL: _rdrand32_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdrandl %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdrand32_step:
+; X64: # BB#0:
+; X64-NEXT: rdrandl %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
%call = call {i32, i32} @llvm.x86.rdrand.32()
%randval = extractvalue {i32, i32} %call, 0
store i32 %randval, i32* %random_val
%isvalid = extractvalue {i32, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdrand32_step:
-; CHECK: rdrandl %e[[T0:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T0]], %eax
-; CHECK: movl %e[[T0]], (%r[[A0]])
-; CHECK: ret
-}
-
-define i32 @_rdrand64_step(i64* %random_val) {
- %call = call {i64, i32} @llvm.x86.rdrand.64()
- %randval = extractvalue {i64, i32} %call, 0
- store i64 %randval, i64* %random_val
- %isvalid = extractvalue {i64, i32} %call, 1
- ret i32 %isvalid
-; CHECK-LABEL: _rdrand64_step:
-; CHECK: rdrandq %r[[T1:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T1]], %eax
-; CHECK: movq %r[[T1]], (%r[[A0]])
-; CHECK: ret
}
; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
define i32 @CSE() nounwind {
+; X86-LABEL: CSE:
+; X86: # BB#0:
+; X86-NEXT: rdrandl %ecx
+; X86-NEXT: rdrandl %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: CSE:
+; X64: # BB#0:
+; X64-NEXT: rdrandl %ecx
+; X64-NEXT: rdrandl %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: retq
%rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%v1 = extractvalue { i32, i32 } %rand1, 0
%rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%v2 = extractvalue { i32, i32 } %rand2, 0
%add = add i32 %v2, %v1
ret i32 %add
-; CHECK-LABEL: CSE:
-; CHECK: rdrandl
-; CHECK: rdrandl
}
; Check that MachineLICM doesn't hoist rdrand instructions.
define void @loop(i32* %p, i32 %n) nounwind {
+; X86-LABEL: loop:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: je .LBB3_3
+; X86-NEXT: # BB#1: # %while.body.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB3_2: # %while.body
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: rdrandl %edx
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: leal 4(%ecx), %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB3_2
+; X86-NEXT: .LBB3_3: # %while.end
+; X86-NEXT: retl
+;
+; X64-LABEL: loop:
+; X64: # BB#0: # %entry
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: je .LBB3_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB3_1: # %while.body
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: rdrandl %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: leaq 4(%rdi), %rdi
+; X64-NEXT: decl %esi
+; X64-NEXT: jne .LBB3_1
+; X64-NEXT: .LBB3_2: # %while.end
+; X64-NEXT: retq
entry:
%tobool1 = icmp eq i32 %n, 0
br i1 %tobool1, label %while.end, label %while.body
@@ -78,8 +129,4 @@ while.body: ; preds = %entry, %while.body
while.end: ; preds = %while.body, %entry
ret void
-; CHECK-LABEL: loop:
-; CHECK-NOT: rdrandl
-; CHECK: This Inner Loop Header: Depth=1
-; CHECK: rdrandl
}
diff --git a/test/CodeGen/X86/rdseed-x86_64.ll b/test/CodeGen/X86/rdseed-x86_64.ll
new file mode 100644
index 0000000000000..b0d9748dd6ae1
--- /dev/null
+++ b/test/CodeGen/X86/rdseed-x86_64.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s
+
+declare {i64, i32} @llvm.x86.rdseed.64()
+
+define i32 @_rdseed64_step(i64* %random_val) {
+; CHECK-LABEL: _rdseed64_step:
+; CHECK: # BB#0:
+; CHECK-NEXT: rdseedq %rcx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovael %ecx, %eax
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %call = call {i64, i32} @llvm.x86.rdseed.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+}
diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll
index c219b4ad27ece..b22e3e7ceac07 100644
--- a/test/CodeGen/X86/rdseed.ll
+++ b/test/CodeGen/X86/rdseed.ll
@@ -1,48 +1,56 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X64
declare {i16, i32} @llvm.x86.rdseed.16()
declare {i32, i32} @llvm.x86.rdseed.32()
-declare {i64, i32} @llvm.x86.rdseed.64()
define i32 @_rdseed16_step(i16* %random_val) {
+; X86-LABEL: _rdseed16_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdseedw %ax
+; X86-NEXT: movzwl %ax, %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movw %dx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdseed16_step:
+; X64: # BB#0:
+; X64-NEXT: rdseedw %ax
+; X64-NEXT: movzwl %ax, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%call = call {i16, i32} @llvm.x86.rdseed.16()
%randval = extractvalue {i16, i32} %call, 0
store i16 %randval, i16* %random_val
%isvalid = extractvalue {i16, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdseed16_step:
-; CHECK: rdseedw %ax
-; CHECK: movzwl %ax, %ecx
-; CHECK: movl $1, %eax
-; CHECK: cmovael %ecx, %eax
-; CHECK: movw %cx, (%r[[A0:di|cx]])
-; CHECK: ret
}
define i32 @_rdseed32_step(i32* %random_val) {
+; X86-LABEL: _rdseed32_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdseedl %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdseed32_step:
+; X64: # BB#0:
+; X64-NEXT: rdseedl %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
%call = call {i32, i32} @llvm.x86.rdseed.32()
%randval = extractvalue {i32, i32} %call, 0
store i32 %randval, i32* %random_val
%isvalid = extractvalue {i32, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdseed32_step:
-; CHECK: rdseedl %e[[T0:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T0]], %eax
-; CHECK: movl %e[[T0]], (%r[[A0]])
-; CHECK: ret
-}
-
-define i32 @_rdseed64_step(i64* %random_val) {
- %call = call {i64, i32} @llvm.x86.rdseed.64()
- %randval = extractvalue {i64, i32} %call, 0
- store i64 %randval, i64* %random_val
- %isvalid = extractvalue {i64, i32} %call, 1
- ret i32 %isvalid
-; CHECK-LABEL: _rdseed64_step:
-; CHECK: rdseedq %r[[T1:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T1]], %eax
-; CHECK: movq %r[[T1]], (%r[[A0]])
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 16e261bf3c5e0..02a968c6f27d1 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -45,9 +45,9 @@ define float @f32_no_estimate(float %x) #0 {
;
; SANDY-LABEL: f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL: # BB#0:
@@ -113,11 +113,11 @@ define float @f32_one_step(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
; HASWELL: # BB#0:
@@ -207,7 +207,7 @@ define float @f32_two_step(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -215,7 +215,7 @@ define float @f32_two_step(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
; HASWELL: # BB#0:
@@ -284,25 +284,25 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
; HASWELL-NEXT: retq # sched: [1:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
; AVX512-NEXT: retq # sched: [1:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -350,18 +350,18 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: retq # sched: [1:1.00]
@@ -370,7 +370,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -379,7 +379,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: retq # sched: [1:1.00]
@@ -453,9 +453,9 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -463,12 +463,12 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -480,7 +480,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -493,7 +493,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -504,7 +504,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -541,30 +541,30 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:19.00]
+; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
; HASWELL-NEXT: retq # sched: [1:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
; AVX512-NEXT: retq # sched: [1:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -610,27 +610,27 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: retq # sched: [1:1.00]
@@ -639,7 +639,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -648,7 +648,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: retq # sched: [1:1.00]
@@ -722,22 +722,22 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -745,12 +745,12 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -762,7 +762,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -775,7 +775,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; KNL-LABEL: v8f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -786,7 +786,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
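All of the one_step/two_step bodies in this file are Newton-Raphson refinements of the hardware reciprocal estimate: given x0 ~= 1/a, one step computes x1 = x0 + x0*(1 - a*x0), which is exactly the vmulps/vsubps-from-1.0/vmulps/vaddps chain in the non-FMA paths and the vfnmadd213ps/vfmadd132ps pair in the FMA ones; the schedule-annotation churn above only retunes latency/throughput numbers, not the algorithm. A one-step sketch in IR under fast-math, with names assumed:

define float @recip_one_step(float %a, float %x0) {
  %t = fmul fast float %a, %x0             ; a*x0, close to 1
  %e = fsub fast float 1.000000e+00, %t    ; error term 1 - a*x0
  %c = fmul fast float %x0, %e             ; x0 * error
  %x1 = fadd fast float %x0, %c            ; refined estimate
  ret float %x1
}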
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index 440a6f0bef13a..c82eab84757f4 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -39,8 +39,8 @@ define float @f32_no_step_2(float %x) #3 {
; SANDY-LABEL: f32_no_step_2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # BB#0:
@@ -110,12 +110,12 @@ define float @f32_one_step_2(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # BB#0:
@@ -198,13 +198,13 @@ define float @f32_one_step_2_divs(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # BB#0:
@@ -305,7 +305,7 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -313,8 +313,8 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # BB#0:
@@ -403,19 +403,19 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -425,7 +425,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -435,7 +435,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -501,20 +501,20 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -525,7 +525,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -536,7 +536,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -619,9 +619,9 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -629,13 +629,13 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -648,7 +648,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -662,7 +662,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -674,7 +674,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -729,29 +729,29 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step2:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
@@ -761,7 +761,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -771,7 +771,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
@@ -835,31 +835,31 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step_2_divs:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
@@ -870,7 +870,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -881,7 +881,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
@@ -964,23 +964,23 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -988,13 +988,13 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1007,7 +1007,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1021,7 +1021,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; KNL-LABEL: v8f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1033,7 +1033,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1064,13 +1064,13 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; BTVER2-LABEL: v8f32_no_step:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step:
; HASWELL: # BB#0:
@@ -1118,15 +1118,15 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; BTVER2-LABEL: v8f32_no_step2:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step2:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
index ba8ff1bc1819c..3bb14c4b1cd83 100644
--- a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
; Test case for the recoloring of broken hints.
; It is tricky to construct something reasonably small that triggers this optimization,
; since it requires that splitting and spilling occur.
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
index 56a7d32850569..c7117be91ab47 100644
--- a/test/CodeGen/X86/rotate4.ll
+++ b/test/CodeGen/X86/rotate4.ll
@@ -1,17 +1,20 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; Check that we recognize this idiom for rotation too:
;   a << (b & (OpSize-1)) | a >> ((0 - b) & (OpSize-1))
; For b % OpSize == k with k != 0, (0 - b) & (OpSize-1) == OpSize - k, the
; complementary shift amount; for k == 0 both shift amounts are 0 and the 'or'
; yields a, i.e. a rotate by zero. The rotate instructions mask the count the
; same way, which is why no 'and' should survive in the output.
define i32 @rotate_left_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_left_32:
-; CHECK-NOT: and
-; CHECK: roll
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: roll %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%and = and i32 %b, 31
%shl = shl i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = lshr i32 %a, %and3
%or = or i32 %shl, %shr
ret i32 %or
@@ -19,13 +22,15 @@ entry:
define i32 @rotate_right_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_right_32:
-; CHECK-NOT: and
-; CHECK: rorl
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorl %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%and = and i32 %b, 31
%shl = lshr i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = shl i32 %a, %and3
%or = or i32 %shl, %shr
ret i32 %or
@@ -33,13 +38,15 @@ entry:
define i64 @rotate_left_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_left_64:
-; CHECK-NOT: and
-; CHECK: rolq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%and = and i64 %b, 63
%shl = shl i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = lshr i64 %a, %and3
%or = or i64 %shl, %shr
ret i64 %or
@@ -47,13 +54,15 @@ entry:
define i64 @rotate_right_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_right_64:
-; CHECK-NOT: and
-; CHECK: rorq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%and = and i64 %b, 63
%shl = lshr i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = shl i64 %a, %and3
%or = or i64 %shl, %shr
ret i64 %or
@@ -63,16 +72,15 @@ entry:
define void @rotate_left_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_left_m32:
-; CHECK-NOT: and
-; CHECK: roll
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: roll %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i32, i32* %pa, align 16
%and = and i32 %b, 31
%shl = shl i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = lshr i32 %a, %and3
%or = or i32 %shl, %shr
store i32 %or, i32* %pa, align 32
@@ -81,16 +89,15 @@ entry:
define void @rotate_right_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_right_m32:
-; CHECK-NOT: and
-; CHECK: rorl
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorl %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i32, i32* %pa, align 16
%and = and i32 %b, 31
%shl = lshr i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = shl i32 %a, %and3
%or = or i32 %shl, %shr
store i32 %or, i32* %pa, align 32
@@ -99,16 +106,15 @@ entry:
define void @rotate_left_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_left_m64:
-; CHECK-NOT: and
-; CHECK: rolq
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolq %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i64, i64* %pa, align 16
%and = and i64 %b, 63
%shl = shl i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = lshr i64 %a, %and3
%or = or i64 %shl, %shr
store i64 %or, i64* %pa, align 64
@@ -117,18 +123,18 @@ entry:
define void @rotate_right_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_right_m64:
-; CHECK-NOT: and
-; CHECK: rorq
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorq %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i64, i64* %pa, align 16
%and = and i64 %b, 63
%shl = lshr i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = shl i64 %a, %and3
%or = or i64 %shl, %shr
store i64 %or, i64* %pa, align 64
ret void
}
+
diff --git a/test/CodeGen/X86/sbb.ll b/test/CodeGen/X86/sbb.ll
index 414780b2d4e65..b6e8ebf6ed068 100644
--- a/test/CodeGen/X86/sbb.ll
+++ b/test/CodeGen/X86/sbb.ll
@@ -146,10 +146,8 @@ define i32 @ugt_select_neg1_or_0(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl %esi, %edi
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp uge i32 %x, %y
%ext = zext i1 %cmp to i32
@@ -163,10 +161,8 @@ define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ule_select_0_or_neg1:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpl %edi, %esi
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp ule i32 %y, %x
%ext = zext i1 %cmp to i32
@@ -180,10 +176,8 @@ define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1_sub:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl %esi, %edi
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp uge i32 %x, %y
%ext = zext i1 %cmp to i32
@@ -191,6 +185,38 @@ define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
ret i32 %sub
}
+; Check more sub-from-zero patterns.
+; (X >u Y) ? -1 : 0 --> cmp, sbb
+
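Why cmp+sbb can materialize the 0/-1 result in one instruction: an unsigned
compare sets CF exactly when the subtraction borrows, and sbb of a register
from itself computes 0 - CF, i.e. 0 or -1. A minimal IR sketch of the pattern
these tests exercise (the function name is illustrative, and the expected asm
is inferred from the CHECK lines below):

define i32 @ugt_mask_sketch(i32 %x, i32 %y) {
  %cmp = icmp ugt i32 %x, %y     ; i1: x >u y
  %z = zext i1 %cmp to i32       ; 0 or 1
  %r = sub i32 0, %z             ; 0 or -1
  ret i32 %r                     ; expect: cmpl %edi, %esi / sbbl %eax, %eax
}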
+define i64 @ugt_select_neg1_or_0_sub(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ugt_select_neg1_or_0_sub:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpq %rdi, %rsi
+; CHECK-NEXT: sbbq %rax, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i64 %x, %y
+ %zext = zext i1 %cmp to i64
+ %sub = sub i64 0, %zext
+ ret i64 %sub
+}
+
+; Swap the predicate and compare operands:
+; (Y <u X) ? -1 : 0 --> cmp, sbb
+
+define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind {
+; CHECK-LABEL: ult_select_neg1_or_0_sub:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpw %di, %si
+; CHECK-NEXT: sbbw %ax, %ax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i16 %y, %x
+ %zext = zext i1 %cmp to i16
+ %sub = sub i16 0, %zext
+ ret i16 %sub
+}
+
; Make sure we're creating nodes with the right value types. This would crash.
; https://bugs.llvm.org/show_bug.cgi?id=33560
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index a97e7c299e73d..0eb9bf46ffd15 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -205,6 +205,111 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) {
ret i32 %sel
}
+; If the constants differ by a small multiplier, use LEA.
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> LEA C2(Cond * (C1-C2))
+
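The multipliers in the test names (2, 3, 5, 9) are exactly the factors an LEA
can form as x+x, x+2x, x+4x, and x+8x. A worked expansion of the rewrite rule
for select_lea_2 (C1 = -1, C2 = 1, so C1-C2 = -2); the value names are
illustrative, not taken from the tests:

  %z = zext i1 %cond to i32      ; 0 or 1
  %m = mul i32 %z, -2            ; 0 or C1-C2
  %r = add i32 %m, 1             ; 1 or -1, i.e. select %cond, i32 -1, i32 1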
+define i32 @select_lea_2(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -1, i32 1
+ ret i32 %sel
+}
+
+define i64 @select_lea_3(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: movq $-2, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -2, i64 1
+ ret i64 %sel
+}
+
+define i32 @select_lea_5(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-2, %ecx
+; CHECK-NEXT: movl $3, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -2, i32 3
+ ret i32 %sel
+}
+
+define i64 @select_lea_9(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_9:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $2, %ecx
+; CHECK-NEXT: movq $-7, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -7, i64 2
+ ret i64 %sel
+}
+
+
+; If the constants differ by a (possibly large) power of 2, the select can instead be a shift of the zero-extended condition by log2 of the difference, plus the smaller constant.
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2
+
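A worked expansion for select_pow2_diff (C1 = 19, C2 = 3, difference
16 = 1 << 4); the value names are illustrative:

  %z = zext i1 %cond to i8       ; 0 or 1
  %s = shl i8 %z, 4              ; 0 or 16
  %r = add i8 %s, 3              ; 3 or 19, i.e. select %cond, i8 19, i8 3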
+define i8 @select_pow2_diff(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movb $19, %al
+; CHECK-NEXT: jne .LBB22_2
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movb $3, %al
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i8 19, i8 3
+ ret i8 %sel
+}
+
+define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movw $7, %cx
+; CHECK-NEXT: movw $71, %ax
+; CHECK-NEXT: cmovnew %cx, %ax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i16 7, i16 71
+ ret i16 %sel
+}
+
+define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_neg:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-9, %ecx
+; CHECK-NEXT: movl $-25, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -9, i32 -25
+ ret i32 %sel
+}
+
+define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_neg_invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $29, %ecx
+; CHECK-NEXT: movq $-99, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -99, i64 29
+ ret i64 %sel
+}
+
; In general, select of 2 constants could be:
; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
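The second form replaces the multiply with a mask: sext of the i1 gives
all-zeros or all-ones, the 'and' then selects 0 or C1-C2, and the add restores
C2 or C1. A worked instance with illustrative constants (C1 = 29, C2 = 42, so
C1-C2 = -13):

  %s = sext i1 %cond to i32      ; 0 or -1
  %m = and i32 %s, -13           ; 0 or C1-C2
  %r = add i32 %m, 42            ; 42 or 29, i.e. select %cond, i32 29, i32 42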
@@ -263,11 +368,11 @@ define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_add_constant_vec:
; CHECK: # BB#0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB22_1
+; CHECK-NEXT: jne .LBB30_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12,13,14,15]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB22_1:
+; CHECK-NEXT: .LBB30_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967293,14,4,4]
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 12, i32 1, i32 0>, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
@@ -279,11 +384,11 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_fmul_constant_vec:
; CHECK: # BB#0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB23_1
+; CHECK-NEXT: jne .LBB31_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.188300e+02,3.454000e+01]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB23_1:
+; CHECK-NEXT: .LBB31_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [-2.040000e+01,3.768000e+01]
; CHECK-NEXT: retq
%sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
diff --git a/test/CodeGen/X86/shift-codegen.ll b/test/CodeGen/X86/shift-codegen.ll
index 7d52bdeb9e3ab..295a55d86a00d 100644
--- a/test/CodeGen/X86/shift-codegen.ll
+++ b/test/CodeGen/X86/shift-codegen.ll
@@ -1,38 +1,36 @@
-; RUN: llc < %s -relocation-model=static -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -relocation-model=static -mtriple=i686-unknown-unknown | FileCheck %s
; This should produce two shll instructions and no LEAs.
target triple = "i686-apple-darwin8"
-@Y = weak global i32 0 ; <i32*> [#uses=1]
-@X = weak global i32 0 ; <i32*> [#uses=2]
-
+@Y = weak global i32 0
+@X = weak global i32 0
define void @fn1() {
; CHECK-LABEL: fn1:
-; CHECK-NOT: ret
-; CHECK-NOT: lea
-; CHECK: shll $3
-; CHECK-NOT: lea
-; CHECK: ret
-
- %tmp = load i32, i32* @Y ; <i32> [#uses=1]
- %tmp1 = shl i32 %tmp, 3 ; <i32> [#uses=1]
- %tmp2 = load i32, i32* @X ; <i32> [#uses=1]
- %tmp3 = or i32 %tmp1, %tmp2 ; <i32> [#uses=1]
+; CHECK: # BB#0:
+; CHECK-NEXT: movl Y, %eax
+; CHECK-NEXT: shll $3, %eax
+; CHECK-NEXT: orl %eax, X
+; CHECK-NEXT: retl
+ %tmp = load i32, i32* @Y
+ %tmp1 = shl i32 %tmp, 3
+ %tmp2 = load i32, i32* @X
+ %tmp3 = or i32 %tmp1, %tmp2
store i32 %tmp3, i32* @X
ret void
}
define i32 @fn2(i32 %X, i32 %Y) {
; CHECK-LABEL: fn2:
-; CHECK-NOT: ret
-; CHECK-NOT: lea
-; CHECK: shll $3
-; CHECK-NOT: lea
-; CHECK: ret
-
- %tmp2 = shl i32 %Y, 3 ; <i32> [#uses=1]
- %tmp4 = or i32 %tmp2, %X ; <i32> [#uses=1]
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $3, %eax
+; CHECK-NEXT: orl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
+ %tmp2 = shl i32 %Y, 3
+ %tmp4 = or i32 %tmp2, %X
ret i32 %tmp4
}
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 6988787089778..76cf4a41a6cbe 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-coalescing | FileCheck %s
define i32* @test1(i32* %P, i32 %X) {
; CHECK-LABEL: test1:
-; CHECK-NOT: shrl
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = lshr i32 %X, 2
%gep.upgrd.1 = zext i32 %Y to i64
%P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.1
@@ -15,11 +16,11 @@ entry:
define i32* @test2(i32* %P, i32 %X) {
; CHECK-LABEL: test2:
-; CHECK: shll $4
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = shl i32 %X, 2
%gep.upgrd.2 = zext i32 %Y to i64
%P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.2
@@ -28,11 +29,11 @@ entry:
define i32* @test3(i32* %P, i32 %X) {
; CHECK-LABEL: test3:
-; CHECK-NOT: shrl
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = ashr i32 %X, 2
%P2 = getelementptr i32, i32* %P, i32 %Y
ret i32* %P2
@@ -40,25 +41,27 @@ entry:
define fastcc i32 @test4(i32* %d) {
; CHECK-LABEL: test4:
-; CHECK-NOT: shrl
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl 3(%ecx), %eax
+; CHECK-NEXT: retl
%tmp4 = load i32, i32* %d
%tmp512 = lshr i32 %tmp4, 24
ret i32 %tmp512
}
-define i64 @test5(i16 %i, i32* %arr) {
; Ensure that we don't fold away shifts which have multiple uses, as they are
; just re-introduced for the second use.
-; CHECK-LABEL: test5:
-; CHECK-NOT: shrl
-; CHECK: shrl $11
-; CHECK-NOT: shrl
-; CHECK: ret
-entry:
+define i64 @test5(i16 %i, i32* %arr) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $11, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: addl (%ecx,%eax,4), %eax
+; CHECK-NEXT: setb %dl
+; CHECK-NEXT: retl
%i.zext = zext i16 %i to i32
%index = lshr i32 %i.zext, 11
%index.zext = zext i32 %index to i64
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index ee8921c41a063..c84869433546b 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -37,24 +37,16 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -62,11 +54,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -74,12 +62,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -166,11 +149,8 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX2-LABEL: shuffle_v16i16_to_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -178,11 +158,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -190,42 +166,22 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -293,48 +249,50 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX-LABEL: shuffle_v8i32_to_v4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps (%rdi), %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vmovaps %xmm0, (%rsi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i32_to_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_to_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
@@ -413,11 +371,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX2-LABEL: shuffle_v32i8_to_v8i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -425,11 +381,8 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -437,39 +390,23 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -542,26 +479,19 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -569,12 +499,8 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -582,31 +508,23 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -676,24 +594,19 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -701,11 +614,8 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -713,30 +623,23 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -802,3 +705,73 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
store <4 x i8> %strided.vec, <4 x i8>* %S
ret void
}
+
+; In this case not all elements are collected from the same source vector, so
+; the resulting BUILD_VECTOR should not be combined to a truncate: the result
+; lanes are [ w[0], v[2], v[4], ..., v[30] ], so lane 0 comes from %w while
+; the rest are the even elements of %v.
+define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
+; AVX1-LABEL: negative:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: negative:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: negative:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: negative:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: negative:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: negative:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
+; AVX512BWVL-NEXT: kmovd %eax, %k1
+; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BWVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %w0 = extractelement <32 x i8> %w, i32 0
+ %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
+ ret <16 x i8> %merged
+}
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index a3ba589758009..69155b5cc565a 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -11,49 +11,37 @@
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -106,54 +94,12 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
-; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i16> %strided.vec, <16 x i16>* %S
@@ -177,11 +123,8 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
@@ -205,127 +148,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
}
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm1
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -347,99 +175,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
}
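
The simplified AVX512 output above reflects that a stride-4 byte shuffle out of a 64-byte vector is just a per-lane truncation, which `vpmovdb` performs directly. An illustrative IR equivalent (a sketch added for clarity; the function name and the bitcast framing are mine, not part of the test file):

define void @shuffle_as_trunc(<64 x i8>* %L, <16 x i8>* %S) {
  ; Illustrative only: on little-endian x86, byte 4*k of the vector is the low
  ; byte of i32 lane k, so the <0,4,8,...,60> shuffle selects exactly the bytes
  ; that truncating each i32 lane to i8 keeps.
  %vec = load <64 x i8>, <64 x i8>* %L
  %lanes = bitcast <64 x i8> %vec to <16 x i32>
  %trunc = trunc <16 x i32> %lanes to <16 x i8>
  store <16 x i8> %trunc, <16 x i8>* %S
  ret void
}
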
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vmovd %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vmovd %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -461,95 +202,12 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi
-; AVX512BW-NEXT: vmovd %edi, %xmm0
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
store <8 x i8> %strided.vec, <8 x i8>* %S
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index 5436cf248bd51..d0b8972cee503 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -2,7 +2,7 @@
; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
; Test that by changing BlockFrequencyInfo we change the order in which
-; machine-sink looks for sucessor blocks. By not using BFI, both G and B
+; machine-sink looks for successor blocks. By not using BFI, both G and B
; have the same loop depth and no instruction is sunk - B is selected but
; can't be used, to avoid breaking a non-profitable critical edge. By using
; BFI, "mul" is sunk into the less frequent block G.
diff --git a/test/CodeGen/X86/sink-gep-before-mem-inst.ll b/test/CodeGen/X86/sink-gep-before-mem-inst.ll
new file mode 100644
index 0000000000000..b9c94adda9936
--- /dev/null
+++ b/test/CodeGen/X86/sink-gep-before-mem-inst.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -S -codegenprepare -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i64 @test.after(i8 addrspace(1)* readonly align 8) {
+; CHECK-LABEL: test.after
+; CHECK: sunkaddr
+entry:
+ %.0 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+ %addr = bitcast i8 addrspace(1)* %.0 to i32 addrspace(1)*
+ br label %header
+
+header:
+ %addr.in.loop = phi i32 addrspace(1)* [ %addr, %entry ], [ %addr.after, %header ]
+ %local_2_ = phi i64 [ 0, %entry ], [ %.9, %header ]
+ %.7 = load i32, i32 addrspace(1)* %addr.in.loop, align 8
+ fence acquire
+ %.1 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+ %addr.after = bitcast i8 addrspace(1)* %.1 to i32 addrspace(1)*
+ %.8 = sext i32 %.7 to i64
+ %.9 = add i64 %local_2_, %.8
+ %not. = icmp sgt i64 %.9, 999
+ br i1 %not., label %exit, label %header
+
+exit:
+ ret i64 %.9
+}
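
The `sunkaddr` string the CHECK line matches is the name CodeGenPrepare gives to an address computation it rebuilds next to the memory access it feeds. Roughly, the loop body above ends up with something of this shape (a hand-written approximation; the exact value naming and numbering are up to the pass):

header:
  ; ...phis as in the original test...
  %sunkaddr = getelementptr i8, i8 addrspace(1)* %0, i64 8
  %cast = bitcast i8 addrspace(1)* %sunkaddr to i32 addrspace(1)*
  %.7 = load i32, i32 addrspace(1)* %cast, align 8
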
diff --git a/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
new file mode 100644
index 0000000000000..0461ee809efb5
--- /dev/null
+++ b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
+;
+; D31946
+; Check that we don't end up with the "LLVM ERROR: Cannot select" error.
+; Additionally, ensure that the output code actually puts fp128 values in SSE registers.
+
+declare fp128 @llvm.fabs.f128(fp128)
+declare fp128 @llvm.copysign.f128(fp128, fp128)
+
+define fp128 @TestSelect(fp128 %a, fp128 %b) {
+ %cmp = fcmp ogt fp128 %a, %b
+ %sub = fsub fp128 %a, %b
+ %res = select i1 %cmp, fp128 %sub, fp128 0xL00000000000000000000000000000000
+ ret fp128 %res
+; CHECK-LABEL: TestSelect:
+; CHECK: movaps 16(%rsp), %xmm1
+; CHECK-NEXT: callq __subtf3
+; CHECK-NEXT: testl %ebx, %ebx
+; CHECK-NEXT: jg .LBB0_2
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movaps .LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: addq $32, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+}
+
+define fp128 @TestFabs(fp128 %a) {
+ %res = call fp128 @llvm.fabs.f128(fp128 %a)
+ ret fp128 %res
+; CHECK-LABEL: TestFabs:
+; CHECK: andps .LCPI1_0(%rip), %xmm0
+; CHECK-NEXT: retq
+}
+
+define fp128 @TestCopysign(fp128 %a, fp128 %b) {
+ %res = call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+ ret fp128 %res
+; CHECK-LABEL: TestCopysign:
+; CHECK: andps .LCPI2_1(%rip), %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: retq
+}
+
+define fp128 @TestFneg(fp128 %a) {
+ %mul = fmul fp128 %a, %a
+ %res = fsub fp128 0xL00000000000000008000000000000000, %mul
+ ret fp128 %res
+; CHECK-LABEL: TestFneg:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: callq __multf3
+; CHECK-NEXT: xorps .LCPI3_0(%rip), %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+}
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index 52e6b61aedfe8..c41acd43b3ab6 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -31,8 +31,8 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
@@ -73,8 +73,8 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_addss:
; SANDY: # BB#0:
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addss:
; HASWELL: # BB#0:
@@ -122,9 +122,9 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_andps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
@@ -176,9 +176,9 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
@@ -228,9 +228,9 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
@@ -277,7 +277,7 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpss:
; HASWELL: # BB#0:
@@ -347,16 +347,16 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_comiss:
; SANDY: # BB#0:
; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comiss:
; HASWELL: # BB#0:
@@ -417,10 +417,10 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ss:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ss:
; HASWELL: # BB#0:
@@ -466,10 +466,10 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ssq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ssq:
; HASWELL: # BB#0:
@@ -515,10 +515,10 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2si:
; HASWELL: # BB#0:
@@ -567,10 +567,10 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2siq:
; HASWELL: # BB#0:
@@ -619,10 +619,10 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2si:
; HASWELL: # BB#0:
@@ -668,10 +668,10 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2siq:
; HASWELL: # BB#0:
@@ -714,9 +714,9 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
@@ -756,9 +756,9 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
;
; SANDY-LABEL: test_divss:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divss:
; HASWELL: # BB#0:
@@ -799,8 +799,8 @@ define void @test_ldmxcsr(i32 %a0) {
; SANDY-LABEL: test_ldmxcsr:
; SANDY: # BB#0:
; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ldmxcsr:
; HASWELL: # BB#0:
@@ -843,8 +843,8 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
@@ -886,8 +886,8 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxss:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxss:
; HASWELL: # BB#0:
@@ -929,8 +929,8 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minps:
; SANDY: # BB#0:
; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
@@ -972,8 +972,8 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minss:
; SANDY: # BB#0:
; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minss:
; HASWELL: # BB#0:
@@ -1017,10 +1017,10 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
@@ -1068,7 +1068,7 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; SANDY-LABEL: test_movhlps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhlps:
; HASWELL: # BB#0:
@@ -1111,10 +1111,10 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhps:
; HASWELL: # BB#0:
@@ -1164,7 +1164,7 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlhps:
; HASWELL: # BB#0:
@@ -1206,10 +1206,10 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlps:
; HASWELL: # BB#0:
@@ -1254,8 +1254,8 @@ define i32 @test_movmskps(<4 x float> %a0) {
;
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
@@ -1295,8 +1295,8 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
@@ -1335,10 +1335,10 @@ define void @test_movss_mem(float* %a0, float* %a1) {
;
; SANDY-LABEL: test_movss_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_mem:
; HASWELL: # BB#0:
@@ -1383,8 +1383,8 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
;
; SANDY-LABEL: test_movss_reg:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_reg:
; HASWELL: # BB#0:
@@ -1423,10 +1423,10 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
@@ -1469,8 +1469,8 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
@@ -1511,8 +1511,8 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_mulss:
; SANDY: # BB#0:
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulss:
; HASWELL: # BB#0:
@@ -1560,9 +1560,9 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
@@ -1609,8 +1609,8 @@ define void @test_prefetchnta(i8* %a0) {
;
; SANDY-LABEL: test_prefetchnta:
; SANDY: # BB#0:
-; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_prefetchnta:
; HASWELL: # BB#0:
@@ -1652,10 +1652,10 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_rcpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [7:3.00]
+; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
@@ -1708,10 +1708,10 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; SANDY-LABEL: test_rcpss:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpss:
; HASWELL: # BB#0:
@@ -1765,9 +1765,9 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
@@ -1819,11 +1819,11 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
;
; SANDY-LABEL: test_rsqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtss:
; HASWELL: # BB#0:
@@ -1875,7 +1875,7 @@ define void @test_sfence() {
; SANDY-LABEL: test_sfence:
; SANDY: # BB#0:
; SANDY-NEXT: sfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sfence:
; HASWELL: # BB#0:
@@ -1917,8 +1917,8 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
@@ -1962,10 +1962,10 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
@@ -2017,11 +2017,11 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtss:
; HASWELL: # BB#0:
@@ -2067,9 +2067,9 @@ define i32 @test_stmxcsr() {
;
; SANDY-LABEL: test_stmxcsr:
; SANDY: # BB#0:
-; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_stmxcsr:
; HASWELL: # BB#0:
@@ -2112,8 +2112,8 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
@@ -2154,8 +2154,8 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_subss:
; SANDY: # BB#0:
; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subss:
; HASWELL: # BB#0:
@@ -2220,16 +2220,16 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_ucomiss:
; SANDY: # BB#0:
; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomiss:
; HASWELL: # BB#0:
@@ -2292,8 +2292,8 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpckhps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
@@ -2338,8 +2338,8 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
@@ -2387,9 +2387,9 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 14c155c8c6c09..3c36b21381390 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -31,8 +31,8 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
@@ -73,8 +73,8 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_addsd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsd:
; HASWELL: # BB#0:
@@ -117,10 +117,10 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_andpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
@@ -170,10 +170,10 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
@@ -226,9 +226,9 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
@@ -275,7 +275,7 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpsd:
; HASWELL: # BB#0:
@@ -345,16 +345,16 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_comisd:
; SANDY: # BB#0:
; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comisd:
; HASWELL: # BB#0:
@@ -416,9 +416,9 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
@@ -467,10 +467,10 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
;
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
@@ -517,10 +517,10 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
@@ -568,10 +568,10 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
@@ -620,9 +620,9 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvtps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
@@ -670,10 +670,10 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_cvtps2pd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2pd:
; HASWELL: # BB#0:
@@ -724,7 +724,7 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2si:
; HASWELL: # BB#0:
@@ -773,10 +773,10 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvtsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2siq:
; HASWELL: # BB#0:
@@ -830,10 +830,10 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; SANDY-LABEL: test_cvtsd2ss:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2ss:
; HASWELL: # BB#0:
@@ -882,9 +882,9 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; SANDY-LABEL: test_cvtsi2sd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sd:
; HASWELL: # BB#0:
@@ -931,9 +931,9 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; SANDY-LABEL: test_cvtsi2sdq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sdq:
; HASWELL: # BB#0:
@@ -985,11 +985,11 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2sd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2sd:
; HASWELL: # BB#0:
@@ -1038,10 +1038,10 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvttpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttpd2dq:
; HASWELL: # BB#0:
@@ -1091,9 +1091,9 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvttps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttps2dq:
; HASWELL: # BB#0:
@@ -1139,10 +1139,10 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2si:
; HASWELL: # BB#0:
@@ -1188,10 +1188,10 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2siq:
; HASWELL: # BB#0:
@@ -1234,9 +1234,9 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
@@ -1276,9 +1276,9 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
;
; SANDY-LABEL: test_divsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divsd:
; HASWELL: # BB#0:
@@ -1322,7 +1322,7 @@ define void @test_lfence() {
; SANDY-LABEL: test_lfence:
; SANDY: # BB#0:
; SANDY-NEXT: lfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lfence:
; HASWELL: # BB#0:
@@ -1363,7 +1363,7 @@ define void @test_mfence() {
; SANDY-LABEL: test_mfence:
; SANDY: # BB#0:
; SANDY-NEXT: mfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mfence:
; HASWELL: # BB#0:
@@ -1402,7 +1402,7 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
; SANDY-LABEL: test_maskmovdqu:
; SANDY: # BB#0:
; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovdqu:
; HASWELL: # BB#0:
@@ -1440,8 +1440,8 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
@@ -1483,8 +1483,8 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxsd:
; HASWELL: # BB#0:
@@ -1526,8 +1526,8 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minpd:
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
@@ -1569,8 +1569,8 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minsd:
; SANDY: # BB#0:
; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minsd:
; HASWELL: # BB#0:
@@ -1614,10 +1614,10 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
@@ -1662,10 +1662,10 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqa:
; HASWELL: # BB#0:
@@ -1710,10 +1710,10 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqu:
; HASWELL: # BB#0:
@@ -1768,12 +1768,12 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; SANDY-LABEL: test_movd:
; SANDY: # BB#0:
; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd:
; HASWELL: # BB#0:
@@ -1838,13 +1838,13 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
;
; SANDY-LABEL: test_movd_64:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33]
-; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd_64:
; HASWELL: # BB#0:
@@ -1900,10 +1900,10 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhpd:
; HASWELL: # BB#0:
@@ -1951,10 +1951,10 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlpd:
; HASWELL: # BB#0:
@@ -1998,8 +1998,8 @@ define i32 @test_movmskpd(<2 x double> %a0) {
;
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
@@ -2039,8 +2039,8 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
@@ -2080,8 +2080,8 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
@@ -2123,10 +2123,10 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
;
; SANDY-LABEL: test_movq_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_mem:
; HASWELL: # BB#0:
@@ -2174,7 +2174,7 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_reg:
; HASWELL: # BB#0:
@@ -2216,10 +2216,10 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
;
; SANDY-LABEL: test_movsd_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_mem:
; HASWELL: # BB#0:
@@ -2266,7 +2266,7 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; SANDY-LABEL: test_movsd_reg:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_reg:
; HASWELL: # BB#0:
@@ -2305,10 +2305,10 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
@@ -2351,8 +2351,8 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
@@ -2393,8 +2393,8 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_mulsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulsd:
; HASWELL: # BB#0:
@@ -2437,10 +2437,10 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orpd:
; HASWELL: # BB#0:
@@ -2496,8 +2496,8 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packssdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packssdw:
; HASWELL: # BB#0:
@@ -2548,8 +2548,8 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packsswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packsswb:
; HASWELL: # BB#0:
@@ -2600,8 +2600,8 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packuswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packuswb:
; HASWELL: # BB#0:
@@ -2648,8 +2648,8 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddb:
; HASWELL: # BB#0:
@@ -2694,8 +2694,8 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_paddd:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddd:
; HASWELL: # BB#0:
@@ -2736,8 +2736,8 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_paddq:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
@@ -2781,9 +2781,9 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_paddsb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsb:
; HASWELL: # BB#0:
@@ -2828,9 +2828,9 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_paddsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsw:
; HASWELL: # BB#0:
@@ -2876,8 +2876,8 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusb:
; HASWELL: # BB#0:
@@ -2923,8 +2923,8 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_paddusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusw:
; HASWELL: # BB#0:
@@ -2969,9 +2969,9 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_paddw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddw:
; HASWELL: # BB#0:
@@ -3015,9 +3015,9 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pand:
; SANDY: # BB#0:
; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
@@ -3070,9 +3070,9 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pandn:
; SANDY: # BB#0:
; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
@@ -3122,8 +3122,8 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pavgb:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgb:
; HASWELL: # BB#0:
@@ -3169,8 +3169,8 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pavgw:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgw:
; HASWELL: # BB#0:
@@ -3217,9 +3217,9 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpeqb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqb:
; HASWELL: # BB#0:
@@ -3269,9 +3269,9 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpeqd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqd:
; HASWELL: # BB#0:
@@ -3321,9 +3321,9 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpeqw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqw:
; HASWELL: # BB#0:
@@ -3374,9 +3374,9 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpgtb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtb:
; HASWELL: # BB#0:
@@ -3427,9 +3427,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpgtd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtd:
; HASWELL: # BB#0:
@@ -3480,9 +3480,9 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpgtw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtw:
; HASWELL: # BB#0:
@@ -3526,9 +3526,9 @@ define i16 @test_pextrw(<8 x i16> %a0) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
@@ -3570,9 +3570,9 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
;
; SANDY-LABEL: test_pinsrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrw:
; HASWELL: # BB#0:
@@ -3620,9 +3620,9 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmaddwd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddwd:
; HASWELL: # BB#0:
@@ -3669,8 +3669,8 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsw:
; HASWELL: # BB#0:
@@ -3716,8 +3716,8 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxub:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxub:
; HASWELL: # BB#0:
@@ -3763,8 +3763,8 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsw:
; HASWELL: # BB#0:
@@ -3810,8 +3810,8 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminub:
; SANDY: # BB#0:
; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminub:
; HASWELL: # BB#0:
@@ -3851,8 +3851,8 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
;
; SANDY-LABEL: test_pmovmskb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovmskb:
; HASWELL: # BB#0:
@@ -3891,7 +3891,7 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhuw:
; HASWELL: # BB#0:
@@ -3932,9 +3932,9 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhw:
; HASWELL: # BB#0:
@@ -3975,9 +3975,9 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmullw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
@@ -4027,7 +4027,7 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuludq:
; HASWELL: # BB#0:
@@ -4073,9 +4073,9 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_por:
; SANDY: # BB#0:
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
@@ -4126,9 +4126,9 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_psadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psadbw:
; HASWELL: # BB#0:
@@ -4176,9 +4176,9 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pshufd:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
-; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50]
+; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufd:
; HASWELL: # BB#0:
@@ -4226,10 +4226,10 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
;
; SANDY-LABEL: test_pshufhw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufhw:
; HASWELL: # BB#0:
@@ -4278,9 +4278,9 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pshuflw:
; SANDY: # BB#0:
; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshuflw:
; HASWELL: # BB#0:
@@ -4326,10 +4326,10 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pslld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslld:
; HASWELL: # BB#0:
@@ -4378,7 +4378,7 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; SANDY-LABEL: test_pslldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslldq:
; HASWELL: # BB#0:
@@ -4417,10 +4417,10 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_psllq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllq:
; HASWELL: # BB#0:
@@ -4468,10 +4468,10 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psllw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllw:
; HASWELL: # BB#0:
@@ -4519,10 +4519,10 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrad:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrad:
; HASWELL: # BB#0:
@@ -4570,10 +4570,10 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psraw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psraw:
; HASWELL: # BB#0:
@@ -4621,10 +4621,10 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrld:
; HASWELL: # BB#0:
@@ -4673,7 +4673,7 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; SANDY-LABEL: test_psrldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrldq:
; HASWELL: # BB#0:
@@ -4712,10 +4712,10 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_psrlq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlq:
; HASWELL: # BB#0:
@@ -4763,10 +4763,10 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psrlw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlw:
; HASWELL: # BB#0:
@@ -4816,8 +4816,8 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
@@ -4862,8 +4862,8 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psubd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
@@ -4904,8 +4904,8 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_psubq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
@@ -4950,8 +4950,8 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsb:
; HASWELL: # BB#0:
@@ -4997,8 +4997,8 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsw:
; HASWELL: # BB#0:
@@ -5044,8 +5044,8 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusb:
; HASWELL: # BB#0:
@@ -5091,8 +5091,8 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusw:
; HASWELL: # BB#0:
@@ -5138,8 +5138,8 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
@@ -5184,8 +5184,8 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_punpckhbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhbw:
; HASWELL: # BB#0:
@@ -5231,9 +5231,9 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckhdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhdq:
; HASWELL: # BB#0:
@@ -5279,10 +5279,10 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
;
; SANDY-LABEL: test_punpckhqdq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhqdq:
; HASWELL: # BB#0:
@@ -5330,8 +5330,8 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpckhwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhwd:
; HASWELL: # BB#0:
@@ -5375,9 +5375,9 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_punpcklbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklbw:
; HASWELL: # BB#0:
@@ -5423,9 +5423,9 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckldq:
; HASWELL: # BB#0:
@@ -5472,9 +5472,9 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; SANDY-LABEL: test_punpcklqdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50]
+; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklqdq:
; HASWELL: # BB#0:
@@ -5522,8 +5522,8 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpcklwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklwd:
; HASWELL: # BB#0:
@@ -5567,9 +5567,9 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pxor:
; SANDY: # BB#0:
; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
@@ -5616,9 +5616,9 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
@@ -5665,10 +5665,10 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [28:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
@@ -5720,11 +5720,11 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtsd:
; HASWELL: # BB#0:
@@ -5771,8 +5771,8 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
@@ -5813,8 +5813,8 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_subsd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subsd:
; HASWELL: # BB#0:
@@ -5879,16 +5879,16 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; SANDY-LABEL: test_ucomisd:
; SANDY: # BB#0:
; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomisd:
; HASWELL: # BB#0:
@@ -5950,9 +5950,9 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpckhpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
@@ -6005,9 +6005,9 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
@@ -6053,10 +6053,10 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index 482b2fcab6425..ef1ddae4532d4 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -31,8 +31,8 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
@@ -74,8 +74,8 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
@@ -116,9 +116,9 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_haddpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
@@ -159,9 +159,9 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
@@ -202,9 +202,9 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
@@ -245,9 +245,9 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
@@ -287,8 +287,8 @@ define <16 x i8> @test_lddqu(i8* %a0) {
;
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
@@ -330,9 +330,9 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
@@ -380,9 +380,9 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
@@ -430,9 +430,9 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 340b9abe88797..1ab1598fcab7c 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -25,10 +25,10 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
;
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
@@ -65,9 +65,9 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
;
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
@@ -107,9 +107,9 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
@@ -150,9 +150,9 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
@@ -187,9 +187,9 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_dppd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dppd:
; HASWELL: # BB#0:
@@ -224,9 +224,9 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
@@ -262,8 +262,8 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
; SANDY-LABEL: test_insertps:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertps:
; HASWELL: # BB#0:
@@ -296,8 +296,8 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
;
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
@@ -328,9 +328,9 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_mpsadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mpsadbw:
; HASWELL: # BB#0:
@@ -367,8 +367,8 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packusdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packusdw:
; HASWELL: # BB#0:
@@ -411,8 +411,8 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; SANDY-LABEL: test_pblendvb:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendvb:
; HASWELL: # BB#0:
@@ -448,8 +448,8 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pblendw:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendw:
; HASWELL: # BB#0:
@@ -483,9 +483,9 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_pcmpeqq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqq:
; HASWELL: # BB#0:
@@ -521,9 +521,9 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
;
; SANDY-LABEL: test_pextrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrb:
; HASWELL: # BB#0:
@@ -558,9 +558,9 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
;
; SANDY-LABEL: test_pextrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrd:
; HASWELL: # BB#0:
@@ -594,9 +594,9 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
;
; SANDY-LABEL: test_pextrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrq:
; HASWELL: # BB#0:
@@ -630,9 +630,9 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
@@ -667,9 +667,9 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
;
; SANDY-LABEL: test_phminposuw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phminposuw:
; HASWELL: # BB#0:
@@ -704,9 +704,9 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
;
; SANDY-LABEL: test_pinsrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrb:
; HASWELL: # BB#0:
@@ -740,9 +740,9 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
;
; SANDY-LABEL: test_pinsrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrd:
; HASWELL: # BB#0:
@@ -778,10 +778,10 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
;
; SANDY-LABEL: test_pinsrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrq:
; HASWELL: # BB#0:
@@ -819,8 +819,8 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsb:
; HASWELL: # BB#0:
@@ -856,8 +856,8 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsd:
; HASWELL: # BB#0:
@@ -893,8 +893,8 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxud:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxud:
; HASWELL: # BB#0:
@@ -930,8 +930,8 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxuw:
; HASWELL: # BB#0:
@@ -967,8 +967,8 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsb:
; HASWELL: # BB#0:
@@ -1004,8 +1004,8 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsd:
; HASWELL: # BB#0:
@@ -1041,8 +1041,8 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminud:
; SANDY: # BB#0:
; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminud:
; HASWELL: # BB#0:
@@ -1078,8 +1078,8 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminuw:
; HASWELL: # BB#0:
@@ -1118,9 +1118,9 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbw:
; HASWELL: # BB#0:
@@ -1162,9 +1162,9 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbd:
; HASWELL: # BB#0:
@@ -1206,9 +1206,9 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbq:
; HASWELL: # BB#0:
@@ -1250,9 +1250,9 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovsxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxdq:
; HASWELL: # BB#0:
@@ -1294,9 +1294,9 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwd:
; HASWELL: # BB#0:
@@ -1338,9 +1338,9 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwq:
; HASWELL: # BB#0:
@@ -1382,9 +1382,9 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbw:
; HASWELL: # BB#0:
@@ -1426,9 +1426,9 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbd:
; HASWELL: # BB#0:
@@ -1470,9 +1470,9 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbq:
; HASWELL: # BB#0:
@@ -1514,9 +1514,9 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovzxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxdq:
; HASWELL: # BB#0:
@@ -1558,9 +1558,9 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwd:
; HASWELL: # BB#0:
@@ -1602,9 +1602,9 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwq:
; HASWELL: # BB#0:
@@ -1642,9 +1642,9 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmuldq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuldq:
; HASWELL: # BB#0:
@@ -1680,9 +1680,9 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmulld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulld:
; HASWELL: # BB#0:
@@ -1724,13 +1724,13 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_ptest:
; SANDY: # BB#0:
-; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: setb %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ptest:
; HASWELL: # BB#0:
@@ -1778,9 +1778,9 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_roundpd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
@@ -1822,9 +1822,9 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_roundps:
; SANDY: # BB#0:
; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
@@ -1867,9 +1867,9 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; SANDY-LABEL: test_roundsd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundsd:
; HASWELL: # BB#0:
@@ -1912,9 +1912,9 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; SANDY-LABEL: test_roundss:
; SANDY: # BB#0:
; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundss:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index afc48bc57ee7d..7ce9ffdbd0ea1 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -26,9 +26,9 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; SANDY-LABEL: crc32_32_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_8:
; HASWELL: # BB#0:
@@ -68,9 +68,9 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; SANDY-LABEL: crc32_32_16:
; SANDY: # BB#0:
; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_16:
; HASWELL: # BB#0:
@@ -112,7 +112,7 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_32:
; HASWELL: # BB#0:
@@ -152,9 +152,9 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; SANDY-LABEL: crc32_64_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_8:
; HASWELL: # BB#0:
@@ -196,7 +196,7 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_64:
; HASWELL: # BB#0:
@@ -256,7 +256,7 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestri:
; HASWELL: # BB#0:
@@ -320,7 +320,7 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestrm:
; HASWELL: # BB#0:
@@ -369,12 +369,12 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistri:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistri:
; HASWELL: # BB#0:
@@ -416,9 +416,9 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistrm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistrm:
; HASWELL: # BB#0:
@@ -453,9 +453,9 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_pcmpgtq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtq:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
new file mode 100644
index 0000000000000..11afdb7989f15
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
+
+define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
+; GENERIC-LABEL: test_extrq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>)
+
+define <2 x i64> @test_extrqi(<2 x i64> %a0) {
+; GENERIC-LABEL: test_extrqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq $2, $3, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq $2, $3, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8)
+
+define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) {
+; GENERIC-LABEL: test_movntsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntsd %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define void @test_movntss(i8* %p, <4 x float> %a) {
+; GENERIC-LABEL: test_movntss:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntss %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntss:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index 8b7a0c0ec02b6..f24969a30c337 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -35,9 +35,9 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; SANDY-LABEL: test_pabsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsb:
; HASWELL: # BB#0:
@@ -86,9 +86,9 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pabsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
@@ -136,7 +136,7 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pabsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
@@ -182,8 +182,8 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_palignr:
; SANDY: # BB#0:
; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
-; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_palignr:
; HASWELL: # BB#0:
@@ -223,9 +223,9 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phaddd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddd:
; HASWELL: # BB#0:
@@ -274,9 +274,9 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddsw:
; HASWELL: # BB#0:
@@ -317,9 +317,9 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddw:
; HASWELL: # BB#0:
@@ -360,9 +360,9 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phsubd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubd:
; HASWELL: # BB#0:
@@ -411,9 +411,9 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubsw:
; HASWELL: # BB#0:
@@ -454,9 +454,9 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubw:
; HASWELL: # BB#0:
@@ -497,9 +497,9 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pmaddubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddubsw:
; HASWELL: # BB#0:
@@ -538,8 +538,8 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhrsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhrsw:
; HASWELL: # BB#0:
@@ -579,8 +579,8 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pshufb:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufb:
; HASWELL: # BB#0:
@@ -630,8 +630,8 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psignb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignb:
; HASWELL: # BB#0:
@@ -681,8 +681,8 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psignd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignd:
; HASWELL: # BB#0:
@@ -732,8 +732,8 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psignw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignw:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/swizzle-avx2.ll b/test/CodeGen/X86/swizzle-avx2.ll
index 29dfa6c2dcc17..6ca9126eb09df 100644
--- a/test/CodeGen/X86/swizzle-avx2.ll
+++ b/test/CodeGen/X86/swizzle-avx2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s
; Test that we correctly fold a shuffle that performs a swizzle of another
; shuffle node according to the rule
@@ -11,81 +12,77 @@
; Check that we produce a single vector permute / shuffle in all cases.
define <8 x i32> @swizzle_1(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_1
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_2(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_2
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
define <8 x i32> @swizzle_3(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_3
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
define <8 x i32> @swizzle_4(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_4
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_5(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_5
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_6(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_6
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_7(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_7
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 80d36d5af4d2c..5ce6bbd4b49ea 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -1,253 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
-define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i32 %a, 4
- %1 = and i32 %0, 4095
- ret i32 %1
-}
-
-define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i32, i32* %a
- %1 = lshr i32 %0, 4
- %2 = and i32 %1, 4095
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i64 %a, 4
- %1 = and i64 %0, 4095
- ret i64 %1
-}
-
-define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i64, i64* %a
- %1 = lshr i64 %0, 4
- %2 = and i64 %1, 4095
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i32 %a, 1
- %1 = and i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i64 %a, 1
- %1 = and i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i32 1, %a
- %1 = xor i32 %0, -1
- %2 = or i32 %1, %a
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i64 1, %a
- %1 = xor i64 %0, -1
- %2 = or i64 %1, %a
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i32 -2, %a
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i64 -2, %a
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = and i32 %1, %0
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = and i64 %1, %0
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i32 %a, 1
- %1 = xor i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i64 %a, 1
- %1 = xor i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i32 %a, 1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i64 %a, 1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i32 %a, -1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i64 %a, -1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = and i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = and i64 %0, %1
- ret i64 %2
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 4
+ %t1 = and i32 %t0, 4095
+ ret i32 %t1
+}
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i32, i32* %a
+ %t1 = lshr i32 %t0, 4
+ %t2 = and i32 %t1, 4095
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 4
+ %t1 = and i64 %t0, 4095
+ ret i64 %t1
+}
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i64, i64* %a
+ %t1 = lshr i64 %t0, 4
+ %t2 = and i64 %t1, 4095
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = and i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = and i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 1, %a
+ %t1 = xor i32 %t0, -1
+ %t2 = or i32 %t1, %a
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 1, %a
+ %t1 = xor i64 %t0, -1
+ %t2 = or i64 %t1, %a
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = sub i32 -2, %a
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = sub i64 -2, %a
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = and i32 %t1, %t0
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = and i64 %t1, %t0
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = xor i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = xor i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, -1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, -1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind {
+; CHECK-LABEL: Ttest_x86_tbm_t1mskc_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = and i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = and i64 %t0, %t1
+ ret i64 %t2
}
+
diff --git a/test/CodeGen/X86/vec-copysign.ll b/test/CodeGen/X86/vec-copysign.ll
index d363dbdaef81f..1ebd7ceafced8 100644
--- a/test/CodeGen/X86/vec-copysign.ll
+++ b/test/CodeGen/X86/vec-copysign.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=CHECK
-; Assertions have been enhanced from utils/update_test_checks.py to show the constant pool values.
+; Assertions have been enhanced from utils/update_llc_test_checks.py to show the constant pool values.
; Use a macosx triple to make sure the format of those constant strings is exact.
; CHECK: [[SIGNMASK1:L.+]]:
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index f7fcd032cab36..556e32d0c87b9 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -1,16 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s
; Without any typed operations, always use the smaller xorps.
-; CHECK: test
-; CHECK: xorps
define <2 x double> @test() {
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retl
ret <2 x double> zeroinitializer
}
; Prefer a constant pool load here.
-; CHECK: test2
-; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}{{CPI|__xmm@}}
define <4 x i32> @test2() nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,0]
+; CHECK-NEXT: retl
ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
}
+
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index b4a58deff2f8d..731760a4ea55e 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -153,14 +153,16 @@ define <32 x i16> @test7(<32 x i16> %a) {
;
; AVX2-LABEL: test7:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test7:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT: retq
@@ -183,7 +185,8 @@ define <16 x i32> @test8(<16 x i32> %a) {
;
; AVX2-LABEL: test8:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 1c352782fca4f..745316effc98b 100644
--- a/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s
; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
; Subtracting zero is free.
define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; CHECK-LABEL: vec_fsub_zero:
-; CHECK-NOT: subps
-; CHECK-NOT: xorps
-; CHECK: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%sub = fsub <4 x float> %x, zeroinitializer
ret <4 x float> %sub
}
@@ -15,9 +15,10 @@ define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; Negating doesn't require subtraction.
define <4 x float> @vec_fneg(<4 x float> %x) {
; CHECK-LABEL: vec_fneg:
-; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0
-; CHECK-NOT: subps
-; CHECK-NEXT: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
%sub = fsub <4 x float> zeroinitializer, %x
ret <4 x float> %sub
}
+
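
The two checks above encode the expected folds: with -enable-unsafe-fp-math and -enable-no-signed-zeros-fp-math, subtracting zero disappears entirely, and negation is lowered as a sign-bit flip (xorps against a sign-mask constant) rather than a real subtraction. The same IR in isolation (hypothetical function names, same RUN line as above):

define <4 x float> @sub_zero(<4 x float> %x) {
  ; folds away completely; codegen is just retq
  %r = fsub <4 x float> %x, zeroinitializer
  ret <4 x float> %r
}
define <4 x float> @negate(<4 x float> %x) {
  ; becomes xorps with a sign-mask from the constant pool, not subps
  %r = fsub <4 x float> zeroinitializer, %x
  ret <4 x float> %r
}
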
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index adda108bdc777..d2f33785530b4 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -344,20 +344,43 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i16:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
ret <8 x i16> %out
}
@@ -431,17 +454,37 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i8:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
ret <16 x i8> %out
}
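
The new AVX512VPOPCNTDQ sequences rely on ctpop commuting with zero extension: zext adds only zero bits, so the popcount is unchanged and sub-dword elements can be widened, counted with vpopcntd/vpopcntq, and narrowed back. Expressed directly in IR, a hypothetical equivalent of what the backend now emits for testv8i16 (not taken from the test file):

declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)

define <8 x i16> @popcnt_via_widening(<8 x i16> %in) {
  %wide = zext <8 x i16> %in to <8 x i64>                    ; vpmovzxwq
  %cnt  = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %wide)  ; vpopcntq
  %out  = trunc <8 x i64> %cnt to <8 x i16>                  ; vpmovqw
  ret <8 x i16> %out
}
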
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index accbad35e9d72..4c5de2fed3852 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -155,17 +155,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX512VPOPCNTDQ-LABEL: testv16i16:
; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
ret <16 x i16> %out
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index aa50206e7a5ee..a6f4e33428973 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,11 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-LABEL: testv8i64:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -28,7 +29,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -42,7 +43,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
@@ -51,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-LABEL: testv16i32:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -82,7 +83,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -100,7 +101,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
@@ -109,7 +110,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -133,7 +134,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -147,36 +148,37 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
-; AVX512VPOPCNTDQ-LABEL: testv32i16:
-; AVX512VPOPCNTDQ: ## BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: retq
+; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
+; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: retq
+;
+; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
+; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-LABEL: testv64i8:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -194,7 +196,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -205,23 +207,35 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
-; AVX512VPOPCNTDQ-LABEL: testv64i8:
-; AVX512VPOPCNTDQ: ## BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: retq
+; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
+; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: retq
+;
+; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
+; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}
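
The vpshufb expansions kept above (and now widened to 512 bits under +avx512bw) all implement the same in-register table lookup: the popcount of a byte is the popcount of its low nibble plus the popcount of its high nibble, with both taken from the [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] table that appears throughout the checks. The scalar equivalent, as a sketch (hypothetical IR, not part of any test above):

@nibble_popcnt = constant [16 x i8] [i8 0, i8 1, i8 1, i8 2, i8 1, i8 2, i8 2, i8 3, i8 1, i8 2, i8 2, i8 3, i8 2, i8 3, i8 3, i8 4]

define i8 @popcnt_byte(i8 %b) {
  %lo     = and i8 %b, 15
  %hi     = lshr i8 %b, 4
  %lo.idx = zext i8 %lo to i64
  %hi.idx = zext i8 %hi to i64
  %lo.ptr = getelementptr inbounds [16 x i8], [16 x i8]* @nibble_popcnt, i64 0, i64 %lo.idx  ; vpshufb, low nibble
  %hi.ptr = getelementptr inbounds [16 x i8], [16 x i8]* @nibble_popcnt, i64 0, i64 %hi.idx  ; vpshufb, high nibble
  %lo.cnt = load i8, i8* %lo.ptr
  %hi.cnt = load i8, i8* %hi.ptr
  %cnt    = add i8 %lo.cnt, %hi.cnt                                                          ; vpaddb
  ret i8 %cnt
}
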
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
new file mode 100644
index 0000000000000..af69a5ac22839
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; Combine tests involving SSE4A target shuffles (EXTRQI, INSERTQI)
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
+; ALL-LABEL: combine_extrqi_pshufb_16i8:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
+; ALL-LABEL: combine_extrqi_pshufb_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %4 = bitcast <16 x i8> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSSE3-LABEL: combine_insertqi_pshufb_16i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE42-LABEL: combine_insertqi_pshufb_16i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: combine_insertqi_pshufb_16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSSE3-LABEL: combine_insertqi_pshufb_8i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE42-LABEL: combine_insertqi_pshufb_8i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: combine_insertqi_pshufb_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %4 = bitcast <16 x i8> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: combine_pshufb_insertqi_pshufb:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %3
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 546b731260396..02314857c6d7e 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -445,6 +445,21 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
ret <16 x i8> %res1
}
+define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
+; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
+; AVX-NEXT: retq
+ %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %res1
+}
+
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 138c421215f4f..e458bb6fa52ff 100644
--- a/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -1,4 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AMD10H
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
@@ -10,7 +11,6 @@
define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) {
; ALL-LABEL: extrqi_len0_idx0:
; ALL: # BB#0:
-; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 0, i8 0)
ret <2 x i64> %1
@@ -36,6 +36,11 @@ define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) {
}
define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -50,12 +55,17 @@ define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
}
define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
@@ -67,12 +77,17 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
}
define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_2zzzzzzz3zzzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz:
@@ -85,6 +100,11 @@ define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
}
define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_01zzuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -99,12 +119,17 @@ define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
}
define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_01zzzzzz23zzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
@@ -143,21 +168,37 @@ define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
}
define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
-; ALL-LABEL: shuf_012zuuuu:
-; ALL: # BB#0:
-; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; AMD10H-LABEL: shuf_012zuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
+; BTVER1-LABEL: shuf_012zuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_012zuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; BTVER2-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %s
}
define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
+; AMD10H-LABEL: shuf_0zzz1zzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzz1zzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzz1zzz:
@@ -169,6 +210,12 @@ define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
}
define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
+; AMD10H-LABEL: shuf_0z1z:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: pxor %xmm1, %xmm1
+; AMD10H-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0z1z:
; BTVER1: # BB#0:
; BTVER1-NEXT: pxor %xmm1, %xmm1
@@ -189,10 +236,20 @@ define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
; A length of zero is equivalent to a bit length of 64.
define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: insertqi_len0_idx0:
-; ALL: # BB#0:
-; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7],xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; AMD10H-LABEL: insertqi_len0_idx0:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movaps %xmm1, %xmm0
+; AMD10H-NEXT: retq
+;
+; BTVER1-LABEL: insertqi_len0_idx0:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm1, %xmm0
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: insertqi_len0_idx0:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovaps %xmm1, %xmm0
+; BTVER2-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0)
ret <2 x i64> %1
}
@@ -303,6 +360,15 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
; Out of range.
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
+; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AMD10H-NEXT: packuswb %xmm0, %xmm0
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: psrld $16, %xmm1
@@ -321,6 +387,13 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
}
define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
+; AMD10H-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; BTVER1: # BB#0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,5,5,4,4,5,5,4,4,5,5,6,6,7,7]
@@ -335,6 +408,12 @@ define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8
}
define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
+; AMD10H-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: psrlq $16, %xmm0
+; AMD10H-NEXT: pand {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; BTVER1: # BB#0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u],zero,xmm0[4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
diff --git a/test/CodeGen/X86/vector-truncate-combine.ll b/test/CodeGen/X86/vector-truncate-combine.ll
index 1a6dac8fa6e41..61808b802517d 100644
--- a/test/CodeGen/X86/vector-truncate-combine.ll
+++ b/test/CodeGen/X86/vector-truncate-combine.ll
@@ -11,14 +11,14 @@
; preservation of the extend/truncate operations mentioned above (2 extend and
; 3 truncate instructions).
;
-; NOTE: This operation could be collapsed in to a single truncate. Once that is done
-; this test will have to be adjusted.
+; NOTE: This operation is now collapsed into a single truncate, so this test no longer
+; covers what it was originally intended to cover.
-; CHECK: PUNPCKLBWrr
-; CHECK: PUNPCKLWDrr
-; CHECK: PACKUSWBrr
+; CHECK: MOVLHPSrr
+; CHECK: PSHUFHWri
; CHECK: PACKUSWBrr
; CHECK: PACKUSWBrr
+; CHECK: MOVPDI2DIrr
define void @test(double %vec.coerce) local_unnamed_addr {
entry:
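
The updated NOTE and CHECK lines go together: once the extend/truncate chain is combined into a single truncate, the PUNPCKLBW/PUNPCKLWD expansion disappears and the shorter MOVLHPS/PSHUFHW/PACKUSWB sequence is selected instead. The shape of the fold in isolation (hypothetical IR, not this test's actual double-coercion input):

define <8 x i8> @collapse_chain(<8 x i32> %v) {
  ; the two steps are combined into a single trunc <8 x i32> to <8 x i8>
  %a = trunc <8 x i32> %v to <8 x i16>
  %b = trunc <8 x i16> %a to <8 x i8>
  ret <8 x i8> %b
}
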
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 4b5a00a30d097..820178d2d9927 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -928,17 +928,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv8i16:
@@ -1095,17 +1088,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv8i16u:
@@ -1243,14 +1229,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv16i8:
@@ -1384,14 +1366,10 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv16i8u:
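
Every tzcnt expansion in this file shares the same prologue ahead of the popcount: negate, mask, decrement. That is the classic identity cttz(x) = ctpop((x & -x) - 1): x & -x isolates the lowest set bit, subtracting one turns it into a mask covering exactly the trailing zeros, and for x == 0 the mask is all ones, yielding the element bit width. The vpsubw/vpand/vpaddw sequence in the checks is this identity vectorized; as an IR sketch (hypothetical function, not from the test):

declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)

define <8 x i16> @cttz_via_ctpop(<8 x i16> %x) {
  %neg  = sub <8 x i16> zeroinitializer, %x   ; vpsubw from zero
  %blsi = and <8 x i16> %x, %neg              ; vpand: lowest set bit per lane
  %mask = add <8 x i16> %blsi, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>  ; vpaddw with all-ones
  %cnt  = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %mask)
  ret <8 x i16> %cnt
}
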
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 16192ec61a550..30e5661d54859 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -584,17 +584,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16:
@@ -722,17 +714,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16u:
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 760216d561c4e..3bf677aadf195 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -34,7 +34,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -52,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -70,7 +70,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -84,7 +84,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -94,7 +94,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -104,7 +104,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -122,7 +122,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -136,7 +136,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -172,7 +172,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -194,7 +194,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -216,7 +216,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -230,7 +230,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -240,7 +240,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -250,7 +250,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -272,7 +272,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -286,7 +286,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -318,7 +318,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -338,7 +338,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -358,35 +358,21 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
@@ -394,7 +380,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -426,7 +412,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -446,7 +432,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -466,35 +452,21 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
@@ -502,7 +474,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -528,7 +500,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -545,7 +517,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -562,7 +534,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -592,7 +564,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -618,7 +590,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -635,7 +607,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -652,7 +624,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index b5c7f86567a13..182d7cc73c9aa 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -101,8 +101,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jge .LBB4_2
; CHECK-NEXT: # BB#1: # %bb1
; CHECK-NEXT: movl $1, %eax
diff --git a/test/CodeGen/X86/x32-lea-1.ll b/test/CodeGen/X86/x32-lea-1.ll
index 2f7d71e2baf1b..afe3581a85bce 100644
--- a/test/CodeGen/X86/x32-lea-1.ll
+++ b/test/CodeGen/X86/x32-lea-1.ll
@@ -1,10 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O0 | FileCheck %s
-; CHECK: leal {{[-0-9]*}}(%r{{s|b}}p),
-; CHECK-NOT: leal {{[-0-9]*}}(%e{{s|b}}p),
define void @foo(i32** %p) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: leal -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: addl $16, %eax
+; CHECK-NEXT: movl %eax, (%edi)
+; CHECK-NEXT: retq
%a = alloca i32, i32 10
%addr = getelementptr i32, i32* %a, i32 4
store i32* %addr, i32** %p
ret void
}
+
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 1263605a6dc03..5f85975fdb5ce 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1,9 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX3
define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_4:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
+; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_4:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -32,6 +49,21 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_2:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
+; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_2:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -54,6 +86,16 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_1:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -98,24 +140,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: load_factori64_4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load_factori64_4:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX-NEXT: vmovdqu 64(%rdi), %ymm2
+; AVX-NEXT: vmovdqu 96(%rdi), %ymm3
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
%strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -128,6 +170,23 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
}
define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; AVX1-LABEL: store_factorf64_4:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vmovupd %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovupd %ymm3, 64(%rdi)
+; AVX1-NEXT: vmovupd %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovupd %ymm2, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX-LABEL: store_factorf64_4:
; AVX: # BB#0:
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
@@ -169,22 +228,22 @@ define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: store_factori64_4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
-; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi)
-; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm2, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: store_factori64_4:
+; AVX: # BB#0:
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; AVX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX-NEXT: vmovdqu %ymm3, 64(%rdi)
+; AVX-NEXT: vmovdqu %ymm4, 32(%rdi)
+; AVX-NEXT: vmovdqu %ymm2, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -252,54 +311,54 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
-; AVX2-NEXT: vmovdqa %ymm4, 64(%rdi)
-; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm8, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: interleaved_store_vf32_i8_stride4:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX-NEXT: vmovdqa %ymm0, 96(%rdi)
+; AVX-NEXT: vmovdqa %ymm4, 64(%rdi)
+; AVX-NEXT: vmovdqa %ymm5, 32(%rdi)
+; AVX-NEXT: vmovdqa %ymm8, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
diff --git a/test/CodeGen/X86/zext-shl.ll b/test/CodeGen/X86/zext-shl.ll
index ac3ecc85f2d90..7722f46d753a6 100644
--- a/test/CodeGen/X86/zext-shl.ll
+++ b/test/CodeGen/X86/zext-shl.ll
@@ -1,25 +1,26 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
-define i32 @t1(i8 zeroext %x) nounwind readnone ssp {
-entry:
+define i32 @t1(i8 zeroext %x) nounwind {
; CHECK-LABEL: t1:
-; CHECK: shll
-; CHECK-NOT: movzwl
-; CHECK: ret
- %0 = zext i8 %x to i16
- %1 = shl i16 %0, 5
- %2 = zext i16 %1 to i32
- ret i32 %2
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: retl
+ %t0 = zext i8 %x to i16
+ %t1 = shl i16 %t0, 5
+ %t2 = zext i16 %t1 to i32
+ ret i32 %t2
}
-define i32 @t2(i8 zeroext %x) nounwind readnone ssp {
-entry:
+define i32 @t2(i8 zeroext %x) nounwind {
; CHECK-LABEL: t2:
-; CHECK: shrl
-; CHECK-NOT: movzwl
-; CHECK: ret
- %0 = zext i8 %x to i16
- %1 = lshr i16 %0, 3
- %2 = zext i16 %1 to i32
- ret i32 %2
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $3, %eax
+; CHECK-NEXT: retl
+ %t0 = zext i8 %x to i16
+ %t1 = lshr i16 %t0, 3
+ %t2 = zext i16 %t1 to i32
+ ret i32 %t2
}
diff --git a/test/CodeGen/X86/zext-trunc.ll b/test/CodeGen/X86/zext-trunc.ll
index 32afd6b96a8b7..e51a77abc92e1 100644
--- a/test/CodeGen/X86/zext-trunc.ll
+++ b/test/CodeGen/X86/zext-trunc.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; rdar://7570931
define i64 @foo(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: foo:
-; CHECK: leal
-; CHECK-NOT: movl
-; CHECK: ret
+; CHECK: # BB#0:
+; CHECK-NEXT: leal (%rdi,%rsi), %eax
+; CHECK-NEXT: retq
%c = add i64 %a, %b
%d = trunc i64 %c to i32
%e = zext i32 %d to i64
diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll
index 3d245e9d396d3..a55eec2782a6a 100644
--- a/test/DebugInfo/COFF/asm.ll
+++ b/test/DebugInfo/COFF/asm.ll
@@ -35,7 +35,7 @@
; OBJ32: CodeViewDebugInfo [
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: CodeSize: 0x6
; OBJ32: DisplayName: f
; OBJ32: LinkageName: _f
@@ -94,13 +94,13 @@
; OBJ64: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: CodeSize: 0xE
; OBJ64: DisplayName: f
; OBJ64: LinkageName: f
; OBJ64: }
; OBJ64-NEXT: ProcEnd {
-; OBJ64-NEXT: }
+; OBJ64: }
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
diff --git a/test/DebugInfo/COFF/cpp-mangling.ll b/test/DebugInfo/COFF/cpp-mangling.ll
index 8d1a136ec5fc1..6f8b5a21ffba6 100644
--- a/test/DebugInfo/COFF/cpp-mangling.ll
+++ b/test/DebugInfo/COFF/cpp-mangling.ll
@@ -12,12 +12,12 @@
; fn_tmpl<int, foo::bar>();
; }
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: FunctionType: bar ({{.*}})
; CHECK: DisplayName: foo::bar{{$}}
; CHECK-NEXT: LinkageName: ?bar@foo@@YAHH@Z
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: FunctionType: fn_tmpl ({{.*}})
; CHECK: DisplayName: foo::fn_tmpl<int,&foo::bar>
; CHECK-NEXT: LinkageName: ??$fn_tmpl@H$1?bar@foo@@YAHH@Z@foo@@YAXXZ
diff --git a/test/DebugInfo/COFF/fp-stack.ll b/test/DebugInfo/COFF/fp-stack.ll
index 4a30a49a3768e..8061e2ee23d1a 100644
--- a/test/DebugInfo/COFF/fp-stack.ll
+++ b/test/DebugInfo/COFF/fp-stack.ll
@@ -11,7 +11,7 @@ entry:
}
; ASM: .cv_def_range Lfunc_begin0 Lfunc_end0, "A\021\200\000\000\000"
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 128
; OBJ: MayHaveNoName: 0
; OBJ: LocalVariableAddrRange {
diff --git a/test/DebugInfo/COFF/globals.ll b/test/DebugInfo/COFF/globals.ll
index 0d1b9413e3d84..f5d6906e181e6 100644
--- a/test/DebugInfo/COFF/globals.ll
+++ b/test/DebugInfo/COFF/globals.ll
@@ -81,13 +81,13 @@
; OBJ: DisplayName: first
; OBJ: LinkageName: ?first@@3HA
; OBJ: }
-; OBJ: ThreadLocalDataSym {
+; OBJ: GlobalTLS {
; OBJ: DataOffset: ?middle@@3PEBHEB+0x0
; OBJ: Type: const int* (0x1001)
; OBJ: DisplayName: middle
; OBJ: LinkageName: ?middle@@3PEBHEB
; OBJ: }
-; OBJ: DataSym {
+; OBJ: GlobalData {
; OBJ: Kind: S_GDATA32 (0x110D)
; OBJ: DataOffset: ?last@@3HA+0x0
; OBJ: Type: int (0x74)
@@ -101,7 +101,7 @@
; OBJ: Magic: 0x4
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: DataSym {
+; OBJ: GlobalData {
; OBJ: DataOffset: ?comdat@?$A@X@@2HB+0x0
; OBJ: Type: const int (0x1000)
; OBJ: DisplayName: comdat
diff --git a/test/DebugInfo/COFF/inlining-files.ll b/test/DebugInfo/COFF/inlining-files.ll
index a6f5d281eb097..e3e616b618da5 100644
--- a/test/DebugInfo/COFF/inlining-files.ll
+++ b/test/DebugInfo/COFF/inlining-files.ll
@@ -18,10 +18,10 @@
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: DisplayName: f
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: file_change (0x1002)
diff --git a/test/DebugInfo/COFF/inlining-header.ll b/test/DebugInfo/COFF/inlining-header.ll
index 0981825e0d3b9..7e19f14716f0b 100644
--- a/test/DebugInfo/COFF/inlining-header.ll
+++ b/test/DebugInfo/COFF/inlining-header.ll
@@ -63,7 +63,7 @@
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: FunctionType: main (0x1005)
; OBJ: CodeOffset: _main+0x0
@@ -74,8 +74,8 @@
; OBJ: LinkageName: _main
; OBJ: }
-; Previously, g's InlineSite referenced t.h, which was wasteful.
-; OBJ: InlineSite {
+; Previously, g's InlineSiteSym referenced t.h, which was wasteful.
+; OBJ: InlineSiteSym {
; OBJ: Inlinee: g (0x1002)
; OBJ: BinaryAnnotations [
; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x6, LineOffset: 1}
@@ -85,7 +85,7 @@
; OBJ-NEXT: ]
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: Inlinee: f (0x1003)
; OBJ: BinaryAnnotations [
; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xD, LineOffset: 1}
diff --git a/test/DebugInfo/COFF/inlining-levels.ll b/test/DebugInfo/COFF/inlining-levels.ll
index 0c5c73c8fdbe5..7f93dbb850a2f 100644
--- a/test/DebugInfo/COFF/inlining-levels.ll
+++ b/test/DebugInfo/COFF/inlining-levels.ll
@@ -18,14 +18,14 @@
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
-; OBJ: InlineSite {
+; OBJ: {{.*}}Proc{{.*}}Sym {
+; OBJ: InlineSiteSym {
; OBJ: Inlinee: h (0x1002)
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: Inlinee: g (0x1003)
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: Inlinee: f (0x1004)
; OBJ: }
; OBJ: InlineSiteEnd {
diff --git a/test/DebugInfo/COFF/inlining-same-name.ll b/test/DebugInfo/COFF/inlining-same-name.ll
index 4a9c9924135d2..3700b7060a7a9 100644
--- a/test/DebugInfo/COFF/inlining-same-name.ll
+++ b/test/DebugInfo/COFF/inlining-same-name.ll
@@ -14,15 +14,15 @@
; CHECK: CodeViewDebugInfo [
; CHECK: Section: .debug$S
; CHECK: Subsection [
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: DisplayName: main
; CHECK: }
-; CHECK: InlineSite {
+; CHECK: InlineSiteSym {
; CHECK: Inlinee: same_name (0x1002)
; CHECK: }
; CHECK: InlineSiteEnd {
; CHECK: }
-; CHECK: InlineSite {
+; CHECK: InlineSiteSym {
; CHECK: Inlinee: same_name (0x1002)
; CHECK: }
; CHECK: InlineSiteEnd {
diff --git a/test/DebugInfo/COFF/inlining.ll b/test/DebugInfo/COFF/inlining.ll
index 76b8f8c88ee2d..ddfd5e056a1b9 100644
--- a/test/DebugInfo/COFF/inlining.ll
+++ b/test/DebugInfo/COFF/inlining.ll
@@ -166,7 +166,7 @@
; OBJ: ]
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: PtrNext: 0x0
@@ -181,7 +181,7 @@
; OBJ: DisplayName: baz
; OBJ: LinkageName: ?baz@@YAXXZ
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: bar (0x1002)
@@ -193,7 +193,7 @@
; OBJ-NEXT: ChangeCodeLength: 0x7
; OBJ: ]
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: foo (0x1003)
diff --git a/test/DebugInfo/COFF/int8-char-type.ll b/test/DebugInfo/COFF/int8-char-type.ll
index 82972a4528196..2e4395b4a599d 100644
--- a/test/DebugInfo/COFF/int8-char-type.ll
+++ b/test/DebugInfo/COFF/int8-char-type.ll
@@ -5,7 +5,7 @@
; DW_ATE_[un]signed encoding for all integer types if they don't have distinct
; integer types for character types. This was PR30552.
-; CHECK-LABEL: DataSym {
+; CHECK-LABEL: GlobalData {
; CHECK-NEXT: Kind: S_GDATA32 (0x110D)
; CHECK-NEXT: DataOffset:
; CHECK-NEXT: Type: signed char (0x10)
@@ -13,7 +13,7 @@
; CHECK-NEXT: LinkageName: x
; CHECK-NEXT: }
-; CHECK-LABEL: DataSym {
+; CHECK-LABEL: GlobalData {
; CHECK-NEXT: Kind: S_GDATA32 (0x110D)
; CHECK-NEXT: DataOffset:
; CHECK-NEXT: Type: unsigned char (0x20)
diff --git a/test/DebugInfo/COFF/local-constant.ll b/test/DebugInfo/COFF/local-constant.ll
index bf8ba8446a6d9..c99dd32e22e48 100644
--- a/test/DebugInfo/COFF/local-constant.ll
+++ b/test/DebugInfo/COFF/local-constant.ll
@@ -11,10 +11,11 @@
; FIXME: Find a way to describe variables optimized to constants.
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: DisplayName: constant_var
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
+; OBJ-NEXT: Kind:
; OBJ-NEXT: Type: int (0x74)
; OBJ-NEXT: Flags [ (0x100)
; OBJ-NEXT: IsOptimizedOut (0x100)
diff --git a/test/DebugInfo/COFF/local-variable-gap.ll b/test/DebugInfo/COFF/local-variable-gap.ll
index a2d05eaa03e41..ab38bbd8c13f8 100644
--- a/test/DebugInfo/COFF/local-variable-gap.ll
+++ b/test/DebugInfo/COFF/local-variable-gap.ll
@@ -66,12 +66,13 @@
; ASM: .short 2 # Record length
; ASM: .short 4431 # Record kind: S_PROC_ID_END
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: VarName: p
; OBJ: }
-; OBJ-NOT: Local {
-; OBJ: DefRangeRegister {
+; OBJ-NOT: LocalSym {
+; OBJ: DefRangeRegisterSym {
+; OBJ-NEXT: Kind:
; OBJ-NEXT: Register: 23
; OBJ-NEXT: MayHaveNoName: 0
; OBJ-NEXT: LocalVariableAddrRange {
diff --git a/test/DebugInfo/COFF/local-variables.ll b/test/DebugInfo/COFF/local-variables.ll
index 249b6e1103dba..f7087f76f4c1c 100644
--- a/test/DebugInfo/COFF/local-variables.ll
+++ b/test/DebugInfo/COFF/local-variables.ll
@@ -99,18 +99,18 @@
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: DisplayName: f
; OBJ: LinkageName: f
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: param
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: No
; OBJ: OffsetInParent: 0
@@ -121,13 +121,13 @@
; OBJ: Range: 0x4F
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: a
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: No
; OBJ: OffsetInParent: 0
@@ -138,13 +138,13 @@
; OBJ: Range: 0x21
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: b
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: No
; OBJ: OffsetInParent: 0
@@ -155,7 +155,7 @@
; OBJ: Range: 0x1F
; OBJ: }
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: will_be_inlined (0x1002)
@@ -166,13 +166,13 @@
; OBJ: ChangeCodeLength: 0xC
; OBJ: ]
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: v
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: No
; OBJ: OffsetInParent: 0
@@ -185,7 +185,7 @@
; OBJ: }
; OBJ: InlineSiteEnd {
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: will_be_inlined (0x1002)
@@ -196,13 +196,13 @@
; OBJ: ChangeCodeLength: 0xA
; OBJ: ]
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: v
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: No
; OBJ: OffsetInParent: 0
diff --git a/test/DebugInfo/COFF/long-name.ll b/test/DebugInfo/COFF/long-name.ll
index 998d77f7ca06d..65bd4c16f7508 100644
--- a/test/DebugInfo/COFF/long-name.ll
+++ b/test/DebugInfo/COFF/long-name.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -filetype=obj | llvm-readobj -codeview | FileCheck %s
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: Kind: S_GPROC32_ID (0x1147)
; CHECK: FunctionType: {{A+}} (0x1002)
; CHECK: CodeOffset: f+0x0
diff --git a/test/DebugInfo/COFF/multifile.ll b/test/DebugInfo/COFF/multifile.ll
index 5e53fa57acc47..8af99a6063e66 100644
--- a/test/DebugInfo/COFF/multifile.ll
+++ b/test/DebugInfo/COFF/multifile.ll
@@ -43,13 +43,13 @@
; OBJ32: ]
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: CodeSize: 0x10
; OBJ32: DisplayName: f
; OBJ32: LinkageName: _f
; OBJ32: }
; OBJ32-NEXT: ProcEnd {
-; OBJ32-NEXT: }
+; OBJ32: }
; OBJ32-NEXT: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _f
@@ -115,13 +115,13 @@
; OBJ64: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: CodeSize: 0x18
; OBJ64: DisplayName: f
; OBJ64: LinkageName: f
; OBJ64: }
; OBJ64-NEXT: ProcEnd {
-; OBJ64-NEXT: }
+; OBJ64: }
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll
index a6290e8f021da..87db2a20eaa6c 100644
--- a/test/DebugInfo/COFF/multifunction.ll
+++ b/test/DebugInfo/COFF/multifunction.ll
@@ -145,7 +145,7 @@
; OBJ32: ]
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: Kind: S_LPROC32_ID (0x1146)
; OBJ32: CodeSize: 0x6
; OBJ32: DisplayName: x
@@ -159,7 +159,7 @@
; OBJ32: ]
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: Kind: S_GPROC32_ID (0x1147)
; OBJ32: CodeSize: 0x6
; OBJ32: DisplayName: y
@@ -173,7 +173,7 @@
; OBJ32: ]
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: Kind: S_GPROC32_ID (0x1147)
; OBJ32: CodeSize: 0x10
; OBJ32: DisplayName: f
@@ -419,7 +419,7 @@
; OBJ64-NEXT: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: Kind: S_LPROC32_ID (0x1146)
; OBJ64: CodeSize: 0xE
; OBJ64: DisplayName: x
@@ -433,7 +433,7 @@
; OBJ64: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: Kind: S_GPROC32_ID (0x1147)
; OBJ64: CodeSize: 0xE
; OBJ64: DisplayName: y
@@ -447,7 +447,7 @@
; OBJ64: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: Kind: S_GPROC32_ID (0x1147)
; OBJ64: CodeSize: 0x18
; OBJ64: DisplayName: f
diff --git a/test/DebugInfo/COFF/pieces.ll b/test/DebugInfo/COFF/pieces.ll
index 60330e0577267..098f2ae62f0b1 100644
--- a/test/DebugInfo/COFF/pieces.ll
+++ b/test/DebugInfo/COFF/pieces.ll
@@ -105,21 +105,21 @@
; ASM: .cv_def_range [[oy_start]] [[oy_end]], "C\021\027\000\000\000\004\000\000\000"
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: loop_csr
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: o
; OBJ: }
-; OBJ: DefRangeSubfieldRegister {
+; OBJ: DefRangeSubfieldRegisterSym {
; OBJ: Register: 24
; OBJ: MayHaveNoName: 0
; OBJ: OffsetInParent: 0
; OBJ: LocalVariableAddrRange {
; OBJ: }
; OBJ: }
-; OBJ: DefRangeSubfieldRegister {
+; OBJ: DefRangeSubfieldRegisterSym {
; OBJ: Register: 23
; OBJ: MayHaveNoName: 0
; OBJ: OffsetInParent: 4
@@ -135,14 +135,14 @@
; ASM: .asciz "o"
; ASM: .cv_def_range .Lfunc_begin1 .Lfunc_end1, "C\021\022\000\000\000\004\000\000\000"
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: pad_right
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: o
; OBJ: }
-; OBJ: DefRangeSubfieldRegister {
+; OBJ: DefRangeSubfieldRegisterSym {
; OBJ: Register: 18
; OBJ: MayHaveNoName: 0
; OBJ: OffsetInParent: 4
@@ -158,14 +158,14 @@
; ASM: .asciz "o"
; ASM: .cv_def_range .Lfunc_begin2 .Lfunc_end2, "C\021\022\000\000\000\000\000\000\000"
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: pad_left
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: o
; OBJ: }
-; OBJ: DefRangeSubfieldRegister {
+; OBJ: DefRangeSubfieldRegisterSym {
; OBJ: Register: 18
; OBJ: MayHaveNoName: 0
; OBJ: OffsetInParent: 0
@@ -185,17 +185,17 @@
; ASM: .asciz "p"
; ASM: .cv_def_range [[p_start]] .Lfunc_end3, "C\021\021\000\000\000\004\000\000\000"
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: nested
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: o
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: p
; OBJ: }
-; OBJ: DefRangeSubfieldRegister {
+; OBJ: DefRangeSubfieldRegisterSym {
; OBJ: Register: 17
; OBJ: MayHaveNoName: 0
; OBJ: OffsetInParent: 4
@@ -212,14 +212,14 @@
; ASM: .asciz "o"
; ASM: .cv_def_range [[spill_o_x_start]] [[spill_o_x_end]], "E\021O\001A\000$\000\000\000"
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: bitpiece_spill
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: VarName: o
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 335
; OBJ: HasSpilledUDTMember: Yes
; OBJ: OffsetInParent: 4
diff --git a/test/DebugInfo/COFF/register-variables.ll b/test/DebugInfo/COFF/register-variables.ll
index d0ca5ca2afadd..f8cd5c4fc3c19 100644
--- a/test/DebugInfo/COFF/register-variables.ll
+++ b/test/DebugInfo/COFF/register-variables.ll
@@ -81,17 +81,17 @@
; OBJ: Subsection [
; OBJ: SubSectionType: Symbols (0xF1)
-; OBJ: ProcStart {
+; OBJ: {{.*}}Proc{{.*}}Sym {
; OBJ: DisplayName: f
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: p
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 18
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0x0
@@ -99,7 +99,7 @@
; OBJ: Range: 0x7
; OBJ: }
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 23
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0x7
@@ -107,13 +107,13 @@
; OBJ: Range: 0x18
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: a
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 17
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0xC
@@ -121,13 +121,13 @@
; OBJ: Range: 0x6
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: c
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 17
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0xC
@@ -135,13 +135,13 @@
; OBJ: Range: 0x4
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: b
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 17
; OBJ: MayHaveNoName: 0
; OBJ: OffsetStart: .text+0x12
@@ -149,19 +149,19 @@
; OBJ: Range: 0x6
; OBJ: }
; OBJ: }
-; OBJ: InlineSite {
+; OBJ: InlineSiteSym {
; OBJ: PtrParent: 0x0
; OBJ: PtrEnd: 0x0
; OBJ: Inlinee: inlineinc (0x1002)
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: a
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 17
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0xC
@@ -169,13 +169,13 @@
; OBJ: Range: 0x6
; OBJ: }
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x0)
; OBJ: ]
; OBJ: VarName: b
; OBJ: }
-; OBJ: DefRangeRegister {
+; OBJ: DefRangeRegisterSym {
; OBJ: Register: 17
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0x12
diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll
index 3a0b1c9fa7cd6..50d121be6942c 100644
--- a/test/DebugInfo/COFF/simple.ll
+++ b/test/DebugInfo/COFF/simple.ll
@@ -77,13 +77,13 @@
; OBJ32-NEXT: ]
; OBJ32: Subsection [
; OBJ32-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ32: ProcStart {
+; OBJ32: {{.*}}Proc{{.*}}Sym {
; OBJ32: CodeSize: 0x6
; OBJ32: DisplayName: f
; OBJ32: LinkageName: _f
; OBJ32: }
; OBJ32-NEXT: ProcEnd {
-; OBJ32-NEXT: }
+; OBJ32: }
; OBJ32-NEXT: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _f
@@ -174,13 +174,13 @@
; OBJ64-NEXT: ]
; OBJ64: Subsection [
; OBJ64-NEXT: SubSectionType: Symbols (0xF1)
-; OBJ64: ProcStart {
+; OBJ64: {{.*}}Proc{{.*}}Sym {
; OBJ64: CodeSize: 0xE
; OBJ64: DisplayName: f
; OBJ64: LinkageName: f
; OBJ64: }
; OBJ64-NEXT: ProcEnd {
-; OBJ64-NEXT: }
+; OBJ64: }
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
diff --git a/test/DebugInfo/COFF/typedef.ll b/test/DebugInfo/COFF/typedef.ll
index cf4e3df257de6..9d841419c561a 100644
--- a/test/DebugInfo/COFF/typedef.ll
+++ b/test/DebugInfo/COFF/typedef.ll
@@ -2,7 +2,7 @@
; CHECK: CodeViewDebugInfo [
; CHECK: Subsection [
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: wchar_t (0x71)
; CHECK: Flags [ (0x0)
; CHECK: ]
@@ -10,7 +10,7 @@
; CHECK: }
; CHECK: Subsection [
; CHECK: SubSectionType: Symbols (0xF1)
-; CHECK: UDT {
+; CHECK: UDTSym {
; CHECK: Type: wchar_t (0x71)
; CHECK: UDTName: XYZ
; CHECK: }
diff --git a/test/DebugInfo/COFF/types-array.ll b/test/DebugInfo/COFF/types-array.ll
index dca3884b1d099..1a4afa8bd2195 100644
--- a/test/DebugInfo/COFF/types-array.ll
+++ b/test/DebugInfo/COFF/types-array.ll
@@ -46,7 +46,7 @@
; CHECK: Magic: 0x4
; CHECK: Subsection [
; CHECK: SubSectionType: Symbols (0xF1)
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: PtrParent: 0x0
; CHECK: PtrEnd: 0x0
; CHECK: PtrNext: 0x0
@@ -61,13 +61,13 @@
; CHECK: DisplayName: f
; CHECK: LinkageName: ?f@@YAXXZ
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: 0x1003
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: a
; CHECK: }
-; CHECK: DefRangeRegisterRel {
+; CHECK: DefRangeRegisterRelSym {
; CHECK: BaseRegister: 22
; CHECK: HasSpilledUDTMember: No
; CHECK: OffsetInParent: 0
diff --git a/test/DebugInfo/COFF/types-basic.ll b/test/DebugInfo/COFF/types-basic.ll
index 4ead4bfc1c4cf..4b9fcd864c276 100644
--- a/test/DebugInfo/COFF/types-basic.ll
+++ b/test/DebugInfo/COFF/types-basic.ll
@@ -218,7 +218,7 @@
; CHECK: CodeViewDebugInfo [
; CHECK: Subsection [
; CHECK: SubSectionType: Symbols (0xF1)
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: DbgStart: 0x0
; CHECK: DbgEnd: 0x0
; CHECK: FunctionType: f (0x1002)
@@ -229,68 +229,68 @@
; CHECK: DisplayName: f
; CHECK: LinkageName: ?f@@YAXMN_J@Z
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: float (0x40)
; CHECK: Flags [ (0x1)
; CHECK: IsParameter (0x1)
; CHECK: ]
; CHECK: VarName: p1
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: double (0x41)
; CHECK: Flags [ (0x1)
; CHECK: IsParameter (0x1)
; CHECK: ]
; CHECK: VarName: p2
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: __int64 (0x13)
; CHECK: Flags [ (0x1)
; CHECK: IsParameter (0x1)
; CHECK: ]
; CHECK: VarName: p3
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: int (0x74)
; CHECK: VarName: v1
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: int* (0x674)
; CHECK: VarName: v2
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: const int* (0x1004)
; CHECK: VarName: v21
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: void* (0x603)
; CHECK: VarName: v3
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: int A::* (0x1006)
; CHECK: VarName: v4
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: void A::() A::* (0x100E)
; CHECK: VarName: v5
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: long (0x12)
; CHECK: VarName: l1
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: long (0x12)
; CHECK: VarName: l2
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: unsigned long (0x22)
; CHECK: VarName: l3
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: unsigned long (0x22)
; CHECK: VarName: l4
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: const void* (0x1010)
; CHECK: VarName: v6
; CHECK: }
@@ -298,48 +298,48 @@
; CHECK: }
; CHECK: ]
; CHECK: Subsection [
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: Type: CharTypes (0x1012)
; CHECK: DisplayName: CharTypes
; CHECK: LinkageName: ?CharTypes@@YAXXZ
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: wchar_t (0x71)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: w
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: unsigned short (0x21)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: us
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: char (0x70)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: c
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: unsigned char (0x20)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: uc
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: signed char (0x10)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: sc
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: char16_t (0x7A)
; CHECK: Flags [ (0x0)
; CHECK: ]
; CHECK: VarName: c16
; CHECK: }
-; CHECK: Local {
+; CHECK: LocalSym {
; CHECK: Type: char32_t (0x7B)
; CHECK: Flags [ (0x0)
; CHECK: ]
diff --git a/test/DebugInfo/COFF/udts.ll b/test/DebugInfo/COFF/udts.ll
index abc688d70a61a..735901f7571cb 100644
--- a/test/DebugInfo/COFF/udts.ll
+++ b/test/DebugInfo/COFF/udts.ll
@@ -18,37 +18,39 @@ target triple = "i686-pc-windows-msvc18.0.0"
; typedef struct { int x; } U;
; U u;
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: DisplayName: f
; CHECK: LinkageName: ?f@@YAXXZ
; CHECK: }
-; CHECK: UDT {
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: int (0x74)
; CHECK-NEXT: UDTName: f::FOO
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
-; CHECK-NEXT: }
-; CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: DisplayName: g
; CHECK: LinkageName: ?g@@YAMPEAUS@@@Z
; CHECK: }
-; CHECK: UDT {
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: g::pun (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: g::pun
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
-; CHECK-NEXT: }
; CHECK: Subsection
-; CHECK-NOT: ProcStart
-; CHECK: UDT {
+; CHECK-NOT: {{.*}}Proc{{.*}}Sym
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: S (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: S
-; CHECK: UDT {
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: <unnamed-tag> (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: U
-; CHECK-NOT: UDT {
+; CHECK-NOT: UDTSym {
%struct.U = type { i32 }
%struct.S = type { i32 }
diff --git a/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o
new file mode 100644
index 0000000000000..c0ed489d846c7
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o
Binary files differ
diff --git a/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s
new file mode 100644
index 0000000000000..9ee9ad234d84c
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s
@@ -0,0 +1,201 @@
+# Test object to verify dwarfdump handles v5 string offset tables in Mach-O.
+# This is similar to dwarfdump-str-offsets.s, with 2 CUs and 1 TU, but
+# without split sections.
+#
+# To generate the test object:
+# llvm-mc -triple i386-apple-darwin9 dwarfdump-str-offsets-macho.s -filetype=obj \
+# -o dwarfdump-str-offsets-macho.o
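+#
+# To inspect the result, the object can be fed to llvm-dwarfdump (a sketch,
+# assuming a build with DWARF v5 string offset table support; exact options
+# and output vary between versions):
+# llvm-dwarfdump dwarfdump-str-offsets-macho.o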
+
+ .section __DWARF,__debug_str,regular,debug
+Linfo_string:
+str_producer:
+ .asciz "Handmade DWARF producer"
+str_CU1:
+ .asciz "Compile_Unit_1"
+str_CU1_dir:
+ .asciz "/home/test/CU1"
+str_CU2:
+ .asciz "Compile_Unit_2"
+str_CU2_dir:
+ .asciz "/home/test/CU2"
+str_TU:
+ .asciz "Type_Unit"
+str_TU_type:
+ .asciz "MyStruct"
+str_Subprogram:
+ .asciz "MyFunc"
+str_Variable1:
+ .asciz "MyVar1"
+str_Variable2:
+ .asciz "MyVar2"
+str_Variable3:
+ .asciz "MyVar3"
+
+ .section __DWARF,__debug_str_offs,regular,debug
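+# Each contribution below uses the DWARF v5 header layout: a 4-byte length,
+# a 2-byte version (5) and 2 bytes of padding, followed by an array of
+# 4-byte offsets into __debug_str.
+# CU1's contribution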
+Ldebug_str_offsets:
+ .long Ldebug_str_offsets_segment0_end-Ldebug_str_offsets_base0
+ .short 5 # DWARF version
+ .short 0 # Padding
+Ldebug_str_offsets_base0:
+ .long str_producer
+ .long str_CU1
+ .long str_CU1_dir
+ .long str_Subprogram
+ .long str_Variable1
+ .long str_Variable2
+ .long str_Variable3
+Ldebug_str_offsets_segment0_end:
+# CU2's contribution
+ .long Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
+ .short 5 # DWARF version
+ .short 0 # Padding
+Ldebug_str_offsets_base1:
+ .long str_producer
+ .long str_CU2
+ .long str_CU2_dir
+Ldebug_str_offsets_segment1_end:
+# The TU's contribution
+ .long Ldebug_str_offsets_segment2_end-Ldebug_str_offsets_base2
+ .short 5 # DWARF version
+ .short 0 # Padding
+Ldebug_str_offsets_base2:
+ .long str_TU
+ .long str_TU_type
+Ldebug_str_offsets_segment2_end:
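+# The DW_FORM_strx* values in the DIEs below are zero-based indices into
+# these arrays, resolved relative to each unit's DW_AT_str_offsets_base.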
+
+ .section __DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+ .byte 0x01 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x01 # DW_CHILDREN_yes
+ .byte 0x25 # DW_AT_producer
+ .byte 0x1a # DW_FORM_strx
+ .byte 0x03 # DW_AT_name
+ .byte 0x1a # DW_FORM_strx
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x1b # DW_AT_comp_dir
+ .byte 0x1a # DW_FORM_strx
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x02 # Abbrev code
+ .byte 0x41 # DW_TAG_type_unit
+ .byte 0x01 # DW_CHILDREN_yes
+ .byte 0x03 # DW_AT_name
+ .byte 0x1a # DW_FORM_strx
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x03 # Abbrev code
+ .byte 0x13 # DW_TAG_structure_type
+ .byte 0x00 # DW_CHILDREN_no (no members)
+ .byte 0x03 # DW_AT_name
+ .byte 0x1a # DW_FORM_strx
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x04 # Abbrev code
+ .byte 0x2e # DW_TAG_subprogram
+ .byte 0x01 # DW_CHILDREN_yes
+ .byte 0x03 # DW_AT_name
+ .byte 0x25 # DW_FORM_strx1
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x05 # Abbrev code
+ .byte 0x34 # DW_TAG_variable
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x03 # DW_AT_name
+ .byte 0x26 # DW_FORM_strx2
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x06 # Abbrev code
+ .byte 0x34 # DW_TAG_variable
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x03 # DW_AT_name
+ .byte 0x27 # DW_FORM_strx3
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x07 # Abbrev code
+ .byte 0x34 # DW_TAG_variable
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x03 # DW_AT_name
+ .byte 0x28 # DW_FORM_strx4
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x00 # EOM(3)
+
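+# Note on the DW_FORM_strx* forms used above: DW_FORM_strx (0x1a) encodes the
+# string index as a ULEB128, while DW_FORM_strx1..strx4 (0x25..0x28) encode it
+# as fixed 1-, 2-, 3- and 4-byte values; this determines the .byte/.short/.long
+# sizes of the name attributes in the DIEs below.
+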
+ .section __DWARF,__debug_info,regular,debug
+Lsection_info:
+# DWARF v5 CU header.
+ .long CU1_5_end-CU1_5_version # Length of Unit
+CU1_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer, DW_AT_name,
+# DW_AT_str_offsets_base and DW_AT_comp_dir attributes.
+ .byte 1 # Abbreviation code
+ .byte 0 # The index of the producer string
+ .byte 1 # The index of the CU name string
+ .long Ldebug_str_offsets_base0-Ldebug_str_offsets
+ .byte 2 # The index of the comp dir string
+# A subprogram DIE with DW_AT_name, using DW_FORM_strx1.
+ .byte 4 # Abbreviation code
+ .byte 3 # Subprogram name string (DW_FORM_strx1)
+# A variable DIE with DW_AT_name, using DW_FORM_strx2.
+ .byte 5 # Abbreviation code
+        .short 0x0004          # Variable name string (DW_FORM_strx2)
+# A variable DIE with DW_AT_name, using DW_FORM_strx3.
+ .byte 6 # Abbreviation code
+        .byte 5                # Variable name string (DW_FORM_strx3), low byte
+        .short 0               # Variable name string (DW_FORM_strx3), high bytes
+# A variable DIE with DW_AT_name, using DW_FORM_strx4.
+ .byte 7 # Abbreviation code
+        .long 0x00000006       # Variable name string (DW_FORM_strx4)
+ .byte 0 # NULL
+ .byte 0 # NULL
+ .byte 0 # NULL
+CU1_5_end:
+
+# DWARF v5 CU header
+ .long CU2_5_end-CU2_5_version # Length of Unit
+CU2_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer, DW_AT_name,
+# DW_AT_str_offsets_base and DW_AT_comp_dir attributes.
+ .byte 1 # Abbreviation code
+ .byte 0 # The index of the producer string
+ .byte 1 # The index of the CU name string
+ .long Ldebug_str_offsets_base1-Ldebug_str_offsets
+ .byte 2 # The index of the comp dir string
+ .byte 0 # NULL
+CU2_5_end:
+
+ .section __DWARF,__debug_types,regular,debug
+# DWARF v5 Type unit header.
+TU_5_start:
+ .long TU_5_end-TU_5_version # Length of Unit
+TU_5_version:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 0x0011223344556677 # Type Signature
+ .long TU_5_type-TU_5_start # Type offset
+# The type-unit DIE, which has a name.
+ .byte 2 # Abbreviation code
+ .byte 0 # Index of the unit type name string
+ .long Ldebug_str_offsets_base2-Ldebug_str_offsets # offset into the str_offsets section
+# The type DIE, which has a name.
+TU_5_type:
+ .byte 3 # Abbreviation code
+ .byte 1 # Index of the type name string
+ .byte 0 # NULL
+ .byte 0 # NULL
+TU_5_end:
+
+
+.subsections_via_symbols
diff --git a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 space b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64-space
index 7330cd8baa1e9..7330cd8baa1e9 100755
--- a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 space
+++ b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64-space
Binary files differ
diff --git a/test/DebugInfo/PDB/Inputs/every-type.cpp b/test/DebugInfo/PDB/Inputs/every-type.cpp
new file mode 100644
index 0000000000000..ed715b0343001
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.cpp
@@ -0,0 +1,63 @@
+// Build with "cl.exe /Zi /GR- /GX- every-type.cpp /link /debug /nodefaultlib /entry:main"
+
+// clang-format off
+void *__purecall = 0;
+
+void __cdecl operator delete(void *,unsigned int) {}
+
+struct FooStruct { }; // LF_STRUCTURE
+
+class FooClass { // LF_CLASS
+ // LF_FIELDLIST
+ enum NestedEnum { // LF_ENUM
+ // LF_NESTTYPE
+ A, B, C // LF_ENUMERATE
+ };
+
+ void RegularMethod() {} // LF_ARGLIST
+ // LF_ONEMETHOD
+ // LF_MFUNCTION
+
+ void OverloadedMethod(int) {} // LF_METHODLIST
+ // LF_METHOD
+ void OverloadedMethod(int, int) {}
+
+ int HiNibble : 4; // LF_BITFIELD
+ int LoNibble : 4;
+ NestedEnum EnumVariable; // LF_MEMBER
+ static void *StaticMember; // LF_POINTER
+ // LF_STMEMBER
+};
+
+void *FooClass::StaticMember = nullptr;
+
+class Inherit : public FooClass { // LF_BCLASS
+public:
+ virtual ~Inherit() {} // LF_VTSHAPE
+ // LF_VFUNCTAB
+};
+
+class VInherit : public virtual FooClass { // LF_VBCLASS
+
+};
+
+class IVInherit : public VInherit { // LF_IVBCLASS
+};
+
+union TheUnion {
+ int X; // LF_UNION
+};
+
+int SomeArray[7] = {1, 2, 3, 4, 5, 6, 7}; // LF_ARRAY
+
+int main(int argc, char **argv) { // LF_PROCEDURE
+ const int X = 7; // LF_MODIFIER
+
+ FooStruct FooStructInstance;
+ FooClass FooClassInstance;
+ Inherit InheritInstance;
+ VInherit VInheritInstance;
+ IVInherit IVInheritInstance;
+ TheUnion UnionInstance;
+ return SomeArray[argc];
+}
diff --git a/test/DebugInfo/PDB/Inputs/every-type.pdb b/test/DebugInfo/PDB/Inputs/every-type.pdb
new file mode 100644
index 0000000000000..64996d61d3e72
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.pdb
Binary files differ
diff --git a/test/DebugInfo/PDB/Inputs/every-type.yaml b/test/DebugInfo/PDB/Inputs/every-type.yaml
new file mode 100644
index 0000000000000..8f23e8ad5e894
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.yaml
@@ -0,0 +1,272 @@
+---
+TpiStream:
+ Records:
+ # int* [Index: 0x1000]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 116 # int
+ Attrs: 32778
+ # const int* [Index: 0x1001]
+ - Kind: LF_MODIFIER
+ Modifier:
+ ModifiedType: 0x1000
+ Modifiers: [ Const ]
+  # char** [Index: 0x1002]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 1136 # char*
+ Attrs: 32778
+ # (int, char **) [Index: 0x1003]
+ - Kind: LF_ARGLIST
+ ArgList:
+ ArgIndicies: [ 116, 0x1002 ]
+ # (int, double) [Index: 0x1004]
+ - Kind: LF_ARGLIST
+ ArgList:
+ ArgIndicies: [ 116, 65 ] # (int, double)
+ # int main(int argc, char **argv) [Index: 0x1005]
+ - Kind: LF_PROCEDURE
+ Procedure:
+      ReturnType:      116 # int
+ CallConv: NearC # __cdecl
+ Options: [ None ]
+ ParameterCount: 2
+ ArgumentList: 0x1003 # (int, char**)
+ # <label> [Index: 0x1006]
+ - Kind: LF_LABEL
+ Label:
+ Mode: Near
+ # <forward decl>
+ # class FooClass; [Index: 0x1007]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 0
+ Options: [ None, ForwardReference ]
+ FieldList: 0
+ Name: 'FooClass'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 0
+  # const FooClass* [Index: 0x1008]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 0x1007 # FooClass
+ Attrs: 33802 # const
+ # int (FooClass::)(int, char **) [Index: 0x1009]
+ - Kind: LF_MFUNCTION
+ MemberFunction:
+ ReturnType: 116 # int
+ ClassType: 0x1007 # FooClass
+ ThisType: 0x1008 # const FooClass*
+ CallConv: ThisCall
+ Options: [ None ]
+ ParameterCount: 2
+ ArgumentList: 0x1003 # (int, char**)
+ ThisPointerAdjustment: 0
+ # int (FooClass::)(int, double) [Index: 0x100A]
+ - Kind: LF_MFUNCTION
+ MemberFunction:
+ ReturnType: 116 # int
+ ClassType: 0x1007 # FooClass
+ ThisType: 0x1008 # const FooClass*
+ CallConv: ThisCall
+ Options: [ None ]
+ ParameterCount: 2
+ ArgumentList: 0x1004 # (int, double)
+ ThisPointerAdjustment: 0
+ # <method overload list>
+ # int (FooClass::)(int, char **)
+ # int (FooClass::)(int, double) [Index: 0x100B]
+ - Kind: LF_METHODLIST
+ MethodOverloadList:
+ Methods:
+ - Type: 0x1009 # int (FooClass::)(int, char **)
+ Attrs: 3 # public
+ VFTableOffset: -1
+ Name: ''
+ - Type: 0x100A # int (FooClass::)(int, double)
+ Attrs: 3 # public
+ VFTableOffset: -1
+ Name: ''
+ # <Field List>
+ # A, B, C [Index: 0x100C]
+ - Kind: LF_FIELDLIST
+ FieldList:
+ - Kind: LF_ENUMERATE
+ Enumerator:
+ Attrs: 3
+ Value: 0
+ Name: A
+ - Kind: LF_ENUMERATE
+ Enumerator:
+ Attrs: 3
+ Value: 1
+ Name: B
+ - Kind: LF_ENUMERATE
+ Enumerator:
+ Attrs: 3
+ Value: 2
+ Name: C
+ # enum FooClass::Enum : uint32_t {
+ # A, B, C
+ # }; [Index: 0x100D]
+ - Kind: LF_ENUM
+ Enum:
+ NumEnumerators: 3
+ Options: [ None, Nested ]
+ FieldList: 0x100C
+ Name: 'FooClass::Enum'
+ UnderlyingType: 117
+ # <Field List>
+ # public:
+ # enum FooEnum : uint32_t {
+ # A, B, C
+ # };
+ # FooEnum EnumMember;
+ # static int StaticInt;
+ # int FooClass::OverloadedMethod(int, char **);
+ # int FooClass::OverloadedMethod(int, double);
+ # int FooClass::RegularMethod(int, double);
+ # [Index: 0x100E]
+ - Kind: LF_FIELDLIST
+ FieldList:
+ # enum FooEnum : uint32_t {
+ # A, B, C
+ # };
+ - Kind: LF_NESTTYPE
+ NestedType:
+ Type: 0x100D
+ Name: FooEnum
+ # FooEnum EnumMember;
+ - Kind: LF_MEMBER
+ DataMember:
+ Attrs: 3 # public
+        Type:            0x100D # FooClass::Enum
+ FieldOffset: 0
+ Name: EnumMember
+ # static int StaticInt;
+ - Kind: LF_STMEMBER
+ StaticDataMember:
+ Attrs: 3 # public
+ Type: 116 # int
+ Name: StaticInt
+ # int FooClass::OverloadedMethod(int, char **);
+ # int FooClass::OverloadedMethod(int, double);
+ - Kind: LF_METHOD
+ OverloadedMethod:
+ NumOverloads: 2
+ MethodList: 0x100B
+ Name: OverloadedMethod
+ # int FooClass::RegularMethod(int, double);
+ - Kind: LF_ONEMETHOD
+ OneMethod:
+ Type: 0x100A
+ Attrs: 3 # public
+ VFTableOffset: -1
+ Name: RegularMethod
+ # class FooClass {
+ # public:
+ # enum FooEnum : uint32_t {
+ # A, B, C
+ # };
+ # FooEnum EnumMember;
+ # static int StaticInt;
+ # int FooClass::OverloadedMethod(int, char **);
+ # int FooClass::OverloadedMethod(int, double);
+ # int FooClass::RegularMethod(int, double);
+ # }; [Index: 0x100F]
+ - Kind: LF_CLASS
+ Class:
+ MemberCount: 6
+ Options: [ None ]
+ FieldList: 0x100E
+ Name: 'FooClass'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 4
+ # struct FooStructure; [Index: 0x1010]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 6
+ Options: [ None ]
+ FieldList: 0x100E
+ Name: 'FooStructure'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 4
+ # interface FooInterface; [Index: 0x1011]
+ - Kind: LF_INTERFACE
+ Class:
+ MemberCount: 6
+ Options: [ None ]
+ FieldList: 0x100E
+ Name: 'FooInterface'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 4
+ # <field list>
+ # : public FooClass [Index: 0x1012]
+ - Kind: LF_FIELDLIST
+ FieldList:
+ - Kind: LF_BCLASS
+ Attrs: 3 # public
+ Type: 0x100F # FooClass
+ Offset: 0
+ # <field list>
+ # : public virtual FooClass [Index: 0x1013]
+ - Kind: LF_FIELDLIST
+ FieldList:
+ - Kind: LF_VBCLASS
+ Attrs: 3 # public
+ BaseType: 0x100F # FooClass
+ VBPtrType: 0x1001 # const int *
+ VBPtrOffset: 0
+ VTableIndex: 1
+ # class Inherit : public FooClass {}; [Index: 0x1014]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 1
+ Options: [ None ]
+ FieldList: 0x100E
+ Name: 'Inherit'
+ DerivationList: 0x1012
+ VTableShape: 0
+ Size: 4
+ # class VInherit : public virtual FooClass {}; [Index: 0x1015]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 1
+ Options: [ None ]
+ FieldList: 0x100E
+      Name:            'VInherit'
+ DerivationList: 0x1012
+ VTableShape: 0
+ Size: 4
+
+# // Member type records. These are generally not length prefixed, and appear
+# // inside of a field list record.
+# MEMBER_RECORD(LF_VFUNCTAB, 0x1409, VFPtr)
+
+# MEMBER_RECORD_ALIAS(LF_BINTERFACE, 0x151a, BaseInterface, BaseClass)
+
+# MEMBER_RECORD_ALIAS(LF_IVBCLASS, 0x1402, IndirectVirtualBaseClass,
+# VirtualBaseClass)
+
+
+# TYPE_RECORD(LF_ARRAY, 0x1503, Array)
+# TYPE_RECORD(LF_UNION, 0x1506, Union)
+# TYPE_RECORD(LF_TYPESERVER2, 0x1515, TypeServer2)
+# TYPE_RECORD(LF_VFTABLE, 0x151d, VFTable)
+# TYPE_RECORD(LF_VTSHAPE, 0x000a, VFTableShape)
+
+# TYPE_RECORD(LF_BITFIELD, 0x1205, BitField)
+
+
+# // ID leaf records. Subsequent leaf types may be referenced from .debug$S.
+# TYPE_RECORD(LF_FUNC_ID, 0x1601, FuncId)
+# TYPE_RECORD(LF_MFUNC_ID, 0x1602, MemberFuncId)
+# TYPE_RECORD(LF_BUILDINFO, 0x1603, BuildInfo)
+# TYPE_RECORD(LF_SUBSTR_LIST, 0x1604, StringList)
+# TYPE_RECORD(LF_STRING_ID, 0x1605, StringId)
+# TYPE_RECORD(LF_UDT_SRC_LINE, 0x1606, UdtSourceLine)
+# TYPE_RECORD(LF_UDT_MOD_SRC_LINE, 0x1607, UdtModSourceLine)
diff --git a/test/DebugInfo/PDB/every-type.test b/test/DebugInfo/PDB/every-type.test
new file mode 100644
index 0000000000000..e6b9c15815d03
--- /dev/null
+++ b/test/DebugInfo/PDB/every-type.test
@@ -0,0 +1,261 @@
+The test input (every-type.pdb) is generated from short and trivial C++ code
+that exercises the entire type system, producing every type record that we
+claim to understand. We then test this in two ways:
+ 1) We dump the output for the purposes of readability. This tests that we
+ can dump every possible type record.
+ 2) We dump the output to YAML, re-generate a PDB with the same type stream,
+ and then run test 1 on the new PDB. This verifies that the types survive a
+ PDB -> YAML -> PDB round trip unchanged.
+
+
+RUN: llvm-pdbutil dump -type-index=0x1018,0x102A,0x103B,0x1093,0x1095,0x1096,0x1098 \
+RUN: -dependents %p/Inputs/every-type.pdb | FileCheck --check-prefix=TYPES %s
+
+RUN: llvm-pdbutil pdb2yaml -tpi-stream -ipi-stream %p/Inputs/every-type.pdb > %t.pdb.yaml
+RUN: llvm-pdbutil yaml2pdb -pdb=%t.yaml.pdb %t.pdb.yaml
+RUN: llvm-pdbutil dump -type-index=0x1018,0x102A,0x103B,0x1093,0x1095,0x1096,0x1098 \
+RUN: -dependents %t.yaml.pdb | FileCheck --check-prefix=TYPES %s
+
+TYPES: Types (TPI Stream)
+TYPES-NEXT: ============================================================
+TYPES-NEXT: Showing 7 records and their dependents (73 records total)
+TYPES-NEXT: 0x1005 | LF_MODIFIER [size = 12]
+TYPES-NEXT: referent = 0x0074 (int), modifiers = const
+TYPES-NEXT: 0x1006 | LF_CLASS [size = 48] `FooClass`
+TYPES-NEXT: unique name: `.?AVFooClass@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
+TYPES-NEXT: options: forward ref | has unique name
+TYPES-NEXT: 0x1007 | LF_VTSHAPE [size = 8]
+TYPES-NEXT: 0x1008 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1007, mode = pointer, opts = None, kind = ptr32
+TYPES-NEXT: 0x1009 | LF_CLASS [size = 44] `Inherit`
+TYPES-NEXT: unique name: `.?AVInherit@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
+TYPES-NEXT: options: forward ref | has unique name
+TYPES-NEXT: 0x100A | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1009, mode = pointer, opts = const, kind = ptr32
+TYPES-NEXT: 0x100B | LF_ARGLIST [size = 8]
+TYPES-NEXT: 0x100C | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x100D | LF_MODIFIER [size = 12]
+TYPES-NEXT: referent = 0x1009, modifiers = const
+TYPES-NEXT: 0x100E | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x100D, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x100F | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x100E: `const Inherit&`
+TYPES-NEXT: 0x1010 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x100F
+TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor
+TYPES-NEXT: 0x1011 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor
+TYPES-NEXT: 0x1012 | LF_METHODLIST [size = 20]
+TYPES-NEXT: - Method [type = 0x1010, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1011, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: 0x1013 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1009, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x1014 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x1013, # args = 1, param list = 0x100F
+TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1015 | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x0075 (unsigned): `unsigned`
+TYPES-NEXT: 0x1016 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0403 (void*), # args = 1, param list = 0x1015
+TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1017 | LF_FIELDLIST [size = 152]
+TYPES-NEXT: - LF_BCLASS
+TYPES-NEXT: type = 0x1006, offset = 4, attrs = public
+TYPES-NEXT: - LF_VFUNCTAB type = 0x1008
+TYPES-NEXT: - LF_ONEMETHOD [name = `~Inherit`]
+TYPES-NEXT: type = 0x100C, vftable offset = 0, attrs = public intro virtual
+TYPES-NEXT: - LF_METHOD [name = `Inherit`, # overloads = 2, overload list = 0x1012]
+TYPES-NEXT: - LF_ONEMETHOD [name = `operator=`]
+TYPES-NEXT: type = 0x1014, vftable offset = -1, attrs = public compiler-generated
+TYPES-NEXT: - LF_ONEMETHOD [name = `__local_vftable_ctor_closure`]
+TYPES-NEXT: type = 0x100C, vftable offset = -1, attrs = public compiler-generated
+TYPES-NEXT: - LF_ONEMETHOD [name = `__vecDelDtor`]
+TYPES-NEXT: type = 0x1016, vftable offset = 0, attrs = public intro virtual compiler-generated
+TYPES-NEXT: 0x1018 | LF_CLASS [size = 44] `Inherit`
+TYPES-NEXT: unique name: `.?AVInherit@@`
+TYPES-NEXT: vtable: 0x1007, base list: <no type>, field list: 0x1017
+TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator=
+TYPES-NEXT: 0x1019 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1005, mode = pointer, opts = None, kind = ptr32
+TYPES-NEXT: 0x101A | LF_CLASS [size = 48] `VInherit`
+TYPES-NEXT: unique name: `.?AVVInherit@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
+TYPES-NEXT: options: forward ref | has unique name
+TYPES-NEXT: 0x101B | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x101A, mode = pointer, opts = const, kind = ptr32
+TYPES-NEXT: 0x101C | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x101A, mode = rvalue ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x101D | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x101C: `VInherit&&`
+TYPES-NEXT: 0x101E | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x101D
+TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x101F | LF_MODIFIER [size = 12]
+TYPES-NEXT: referent = 0x101A, modifiers = const
+TYPES-NEXT: 0x1020 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x101F, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x1021 | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x1020: `const VInherit&`
+TYPES-NEXT: 0x1022 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1021
+TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x1023 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x1024 | LF_METHODLIST [size = 28]
+TYPES-NEXT: - Method [type = 0x101E, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1022, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1023, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: 0x1025 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x101A, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x1026 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x1025, # args = 1, param list = 0x101D
+TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1027 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x1025, # args = 1, param list = 0x1021
+TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1028 | LF_METHODLIST [size = 20]
+TYPES-NEXT: - Method [type = 0x1026, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1027, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: 0x1029 | LF_FIELDLIST [size = 60]
+TYPES-NEXT: - LF_VBCLASS
+TYPES-NEXT: base = 0x1006, vbptr = 0x1019, vbptr offset = 0, vtable index = 1
+TYPES-NEXT: attrs = public
+TYPES-NEXT: - LF_METHOD [name = `VInherit`, # overloads = 3, overload list = 0x1024]
+TYPES-NEXT: - LF_METHOD [name = `operator=`, # overloads = 2, overload list = 0x1028]
+TYPES-NEXT: 0x102A | LF_CLASS [size = 48] `VInherit`
+TYPES-NEXT: unique name: `.?AVVInherit@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1029
+TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator=
+TYPES-NEXT: 0x102B | LF_CLASS [size = 48] `IVInherit`
+TYPES-NEXT: unique name: `.?AVIVInherit@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
+TYPES-NEXT: options: forward ref | has unique name
+TYPES-NEXT: 0x102C | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x102B, mode = pointer, opts = const, kind = ptr32
+TYPES-NEXT: 0x102D | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x102B, mode = rvalue ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x102E | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x102D: `IVInherit&&`
+TYPES-NEXT: 0x102F | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x102E
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x1030 | LF_MODIFIER [size = 12]
+TYPES-NEXT: referent = 0x102B, modifiers = const
+TYPES-NEXT: 0x1031 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1030, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x1032 | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x1031: `const IVInherit&`
+TYPES-NEXT: 0x1033 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1032
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x1034 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor
+TYPES-NEXT: 0x1035 | LF_METHODLIST [size = 28]
+TYPES-NEXT: - Method [type = 0x102F, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1033, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1034, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: 0x1036 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x102B, mode = ref, opts = None, kind = ptr32
+TYPES-NEXT: 0x1037 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x1036, # args = 1, param list = 0x102E
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1038 | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x1036, # args = 1, param list = 0x1032
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x1039 | LF_METHODLIST [size = 20]
+TYPES-NEXT: - Method [type = 0x1037, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: - Method [type = 0x1038, vftable offset = -1, attrs = public compiler-generated]
+TYPES-NEXT: 0x103A | LF_FIELDLIST [size = 72]
+TYPES-NEXT: - LF_BCLASS
+TYPES-NEXT: type = 0x101A, offset = 0, attrs = public
+TYPES-NEXT: - LF_IVBCLASS
+TYPES-NEXT: base = 0x1006, vbptr = 0x1019, vbptr offset = 0, vtable index = 1
+TYPES-NEXT: attrs = public
+TYPES-NEXT: - LF_METHOD [name = `IVInherit`, # overloads = 3, overload list = 0x1035]
+TYPES-NEXT: - LF_METHOD [name = `operator=`, # overloads = 2, overload list = 0x1039]
+TYPES-NEXT: 0x103B | LF_CLASS [size = 48] `IVInherit`
+TYPES-NEXT: unique name: `.?AVIVInherit@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x103A
+TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator=
+TYPES-NEXT: 0x1087 | LF_FIELDLIST [size = 28]
+TYPES-NEXT: - LF_ENUMERATE [A = 0]
+TYPES-NEXT: - LF_ENUMERATE [B = 1]
+TYPES-NEXT: - LF_ENUMERATE [C = 2]
+TYPES-NEXT: 0x1088 | LF_ENUM [size = 64] `FooClass::NestedEnum`
+TYPES-NEXT: unique name: `.?AW4NestedEnum@FooClass@@`
+TYPES-NEXT: field list: 0x1087, underlying type: 0x0074 (int)
+TYPES-NEXT: options: has unique name | is nested
+TYPES-NEXT: 0x1089 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1006, mode = pointer, opts = const, kind = ptr32
+TYPES-NEXT: 0x108A | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x108B | LF_ARGLIST [size = 16]
+TYPES-NEXT: 0x0074 (int): `int`
+TYPES-NEXT: 0x0074 (int): `int`
+TYPES-NEXT: 0x108C | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 2, param list = 0x108B
+TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x108D | LF_ARGLIST [size = 12]
+TYPES-NEXT: 0x0074 (int): `int`
+TYPES-NEXT: 0x108E | LF_MFUNCTION [size = 28]
+TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x108D
+TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0
+TYPES-NEXT: calling conv = thiscall, options = None
+TYPES-NEXT: 0x108F | LF_METHODLIST [size = 20]
+TYPES-NEXT: - Method [type = 0x108C, vftable offset = -1, attrs = private]
+TYPES-NEXT: - Method [type = 0x108E, vftable offset = -1, attrs = private]
+TYPES-NEXT: 0x1090 | LF_BITFIELD [size = 12]
+TYPES-NEXT: type = 0x0074 (int), bit offset = 0, # bits = 4
+TYPES-NEXT: 0x1091 | LF_BITFIELD [size = 12]
+TYPES-NEXT: type = 0x0074 (int), bit offset = 4, # bits = 4
+TYPES-NEXT: 0x1092 | LF_FIELDLIST [size = 164]
+TYPES-NEXT: - LF_NESTTYPE [name = `NestedEnum`, parent = 0x1088]
+TYPES-NEXT: - LF_ONEMETHOD [name = `RegularMethod`]
+TYPES-NEXT: type = 0x108A, vftable offset = -1, attrs = private
+TYPES-NEXT: - LF_METHOD [name = `OverloadedMethod`, # overloads = 2, overload list = 0x108F]
+TYPES-NEXT: - LF_MEMBER [name = `HiNibble`, Type = 0x1090, offset = 0, attrs = private]
+TYPES-NEXT: - LF_MEMBER [name = `LoNibble`, Type = 0x1091, offset = 0, attrs = private]
+TYPES-NEXT: - LF_MEMBER [name = `EnumVariable`, Type = 0x1088, offset = 4, attrs = private]
+TYPES-NEXT: - LF_STMEMBER [name = `StaticMember`, type = 0x0403 (void*), attrs = private]
+TYPES-NEXT: 0x1093 | LF_CLASS [size = 48] `FooClass`
+TYPES-NEXT: unique name: `.?AVFooClass@@`
+TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1092
+TYPES-NEXT: options: contains nested class | has unique name
+TYPES-NEXT: 0x1094 | LF_FIELDLIST [size = 16]
+TYPES-NEXT: - LF_MEMBER [name = `X`, Type = 0x0074 (int), offset = 0, attrs = public]
+TYPES-NEXT: 0x1095 | LF_UNION [size = 40] `TheUnion`
+TYPES-NEXT: unique name: `.?ATTheUnion@@`
+TYPES-NEXT: field list: 0x1094
+TYPES-NEXT: options: has unique name | sealed
+TYPES-NEXT: 0x1096 | LF_PROCEDURE [size = 16]
+TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B
+TYPES-NEXT: calling conv = cdecl, options = None
+TYPES-NEXT: 0x1097 | LF_POINTER [size = 12]
+TYPES-NEXT: referent = 0x1096, mode = pointer, opts = const, kind = ptr32
+TYPES-NEXT: 0x1098 | LF_ARRAY [size = 16]
+TYPES-NEXT: size: 4, index type: 0x0022 (unsigned long), element type: 0x1097
diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test
index 3b7895e06b77d..1887af2e82683 100644
--- a/test/DebugInfo/PDB/pdbdump-headers.test
+++ b/test/DebugInfo/PDB/pdbdump-headers.test
@@ -67,9 +67,11 @@ ALL-NEXT: ============================================================
ALL-NEXT: Mod 0000 | Name: `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`:
ALL-NEXT: Obj: `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`:
ALL-NEXT: debug stream: 12, # files: 1, has ec info: false
+ALL-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
ALL-NEXT: Mod 0001 | Name: `* Linker *`:
ALL-NEXT: Obj: ``:
ALL-NEXT: debug stream: 14, # files: 0, has ec info: false
+ALL-NEXT: pdb file ni: 1 `{{.*empty.pdb}}`, src file ni: 0 ``
ALL: Files
ALL-NEXT: ============================================================
ALL-NEXT: Mod 0000 | `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`:
@@ -99,13 +101,11 @@ ALL-NEXT: - LF_ENUMERATE [single = 2]
ALL-NEXT: - LF_ENUMERATE [free = 3]
ALL-NEXT: - LF_ENUMERATE [neutral = 4]
ALL-NEXT: - LF_ENUMERATE [both = 5]
-ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239]
-ALL-NEXT: name: `__vc_attributes::threadingAttribute::threading_e`
+ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] `__vc_attributes::threadingAttribute::threading_e`
ALL-NEXT: unique name: `.?AW4threading_e@threadingAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x1002, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377]
-ALL-NEXT: class name: `__vc_attributes::threadingAttribute`
+ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] `__vc_attributes::threadingAttribute`
ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -128,8 +128,7 @@ ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 185421]
ALL-NEXT: - LF_NESTTYPE [name = `threading_e`, parent = 0x1003]
ALL-NEXT: - LF_METHOD [name = `threadingAttribute`, # overloads = 2, overload list = 0x1009]
ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x1003, offset = 0, attrs = public]
-ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540]
-ALL-NEXT: class name: `__vc_attributes::threadingAttribute`
+ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] `__vc_attributes::threadingAttribute`
ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x100A
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -137,13 +136,11 @@ ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 261871]
ALL-NEXT: - LF_ENUMERATE [native = 0]
ALL-NEXT: - LF_ENUMERATE [com = 1]
ALL-NEXT: - LF_ENUMERATE [managed = 2]
-ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119]
-ALL-NEXT: name: `__vc_attributes::event_receiverAttribute::type_e`
+ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] `__vc_attributes::event_receiverAttribute::type_e`
ALL-NEXT: unique name: `.?AW4type_e@event_receiverAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056]
-ALL-NEXT: class name: `__vc_attributes::event_receiverAttribute`
+ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] `__vc_attributes::event_receiverAttribute`
ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -175,8 +172,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x100D]
ALL-NEXT: - LF_METHOD [name = `event_receiverAttribute`, # overloads = 3, overload list = 0x1015]
ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x100D, offset = 0, attrs = public]
ALL-NEXT: - LF_MEMBER [name = `layout_dependent`, Type = 0x0030 (bool), offset = 4, attrs = public]
-ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734]
-ALL-NEXT: class name: `__vc_attributes::event_receiverAttribute`
+ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] `__vc_attributes::event_receiverAttribute`
ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1016
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -184,13 +180,11 @@ ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 81128]
ALL-NEXT: - LF_ENUMERATE [never = 0]
ALL-NEXT: - LF_ENUMERATE [allowed = 1]
ALL-NEXT: - LF_ENUMERATE [always = 2]
-ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158]
-ALL-NEXT: name: `__vc_attributes::aggregatableAttribute::type_e`
+ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] `__vc_attributes::aggregatableAttribute::type_e`
ALL-NEXT: unique name: `.?AW4type_e@aggregatableAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x1018, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249]
-ALL-NEXT: class name: `__vc_attributes::aggregatableAttribute`
+ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] `__vc_attributes::aggregatableAttribute`
ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -213,26 +207,22 @@ ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 6214]
ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1019]
ALL-NEXT: - LF_METHOD [name = `aggregatableAttribute`, # overloads = 2, overload list = 0x101F]
ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1019, offset = 0, attrs = public]
-ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935]
-ALL-NEXT: class name: `__vc_attributes::aggregatableAttribute`
+ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] `__vc_attributes::aggregatableAttribute`
ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1020
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
-ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449]
-ALL-NEXT: name: `__vc_attributes::event_sourceAttribute::type_e`
+ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] `__vc_attributes::event_sourceAttribute::type_e`
ALL-NEXT: unique name: `.?AW4type_e@event_sourceAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 135589]
ALL-NEXT: - LF_ENUMERATE [speed = 0]
ALL-NEXT: - LF_ENUMERATE [size = 1]
-ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373]
-ALL-NEXT: name: `__vc_attributes::event_sourceAttribute::optimize_e`
+ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] `__vc_attributes::event_sourceAttribute::optimize_e`
ALL-NEXT: unique name: `.?AW4optimize_e@event_sourceAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x1023, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512]
-ALL-NEXT: class name: `__vc_attributes::event_sourceAttribute`
+ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] `__vc_attributes::event_sourceAttribute`
ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -258,8 +248,7 @@ ALL-NEXT: - LF_METHOD [name = `event_sourceAttribute`, # overloads =
ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1022, offset = 0, attrs = public]
ALL-NEXT: - LF_MEMBER [name = `optimize`, Type = 0x1024, offset = 4, attrs = public]
ALL-NEXT: - LF_MEMBER [name = `decorate`, Type = 0x0030 (bool), offset = 8, attrs = public]
-ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560]
-ALL-NEXT: class name: `__vc_attributes::event_sourceAttribute`
+ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] `__vc_attributes::event_sourceAttribute`
ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x102B
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -270,13 +259,11 @@ ALL-NEXT: - LF_ENUMERATE [service = 3]
ALL-NEXT: - LF_ENUMERATE [unspecified = 4]
ALL-NEXT: - LF_ENUMERATE [EXE = 2]
ALL-NEXT: - LF_ENUMERATE [SERVICE = 3]
-ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151]
-ALL-NEXT: name: `__vc_attributes::moduleAttribute::type_e`
+ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] `__vc_attributes::moduleAttribute::type_e`
ALL-NEXT: unique name: `.?AW4type_e@moduleAttribute@__vc_attributes@@`
ALL-NEXT: field list: 0x102D, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306]
-ALL-NEXT: class name: `__vc_attributes::moduleAttribute`
+ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] `__vc_attributes::moduleAttribute`
ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -338,8 +325,7 @@ ALL-NEXT: - LF_MEMBER [name = `hidden`, Type = 0x0030 (bool), offset
ALL-NEXT: - LF_MEMBER [name = `restricted`, Type = 0x0030 (bool), offset = 45, attrs = public]
ALL-NEXT: - LF_MEMBER [name = `custom`, Type = 0x1032, offset = 48, attrs = public]
ALL-NEXT: - LF_MEMBER [name = `resource_name`, Type = 0x1032, offset = 52, attrs = public]
-ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548]
-ALL-NEXT: class name: `__vc_attributes::moduleAttribute`
+ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] `__vc_attributes::moduleAttribute`
ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1039
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -374,13 +360,11 @@ ALL-NEXT: - LF_ENUMERATE [eModuleUsage = 16777216]
ALL-NEXT: - LF_ENUMERATE [eIllegalUsage = 33554432]
ALL-NEXT: - LF_ENUMERATE [eAsynchronousUsage = 67108864]
ALL-NEXT: - LF_ENUMERATE [eAnyIDLUsage = 4161535]
-ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328]
-ALL-NEXT: name: `__vc_attributes::helper_attributes::usageAttribute::usage_e`
+ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] `__vc_attributes::helper_attributes::usageAttribute::usage_e`
ALL-NEXT: unique name: `.?AW4usage_e@usageAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: field list: 0x103B, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640]
-ALL-NEXT: class name: `__vc_attributes::helper_attributes::usageAttribute`
+ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] `__vc_attributes::helper_attributes::usageAttribute`
ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -397,8 +381,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `usage_e`, parent = 0x103C]
ALL-NEXT: - LF_ONEMETHOD [name = `usageAttribute`]
ALL-NEXT: type = 0x1040, vftable offset = -1, attrs = public
ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x0075 (unsigned), offset = 0, attrs = public]
-ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040]
-ALL-NEXT: class name: `__vc_attributes::helper_attributes::usageAttribute`
+ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] `__vc_attributes::helper_attributes::usageAttribute`
ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1041
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -407,13 +390,11 @@ ALL-NEXT: - LF_ENUMERATE [eBoolean = 0]
ALL-NEXT: - LF_ENUMERATE [eInteger = 1]
ALL-NEXT: - LF_ENUMERATE [eFloat = 2]
ALL-NEXT: - LF_ENUMERATE [eDouble = 3]
-ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625]
-ALL-NEXT: name: `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e`
+ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e`
ALL-NEXT: unique name: `.?AW4type_e@v1_alttypeAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: field list: 0x1043, underlying type: 0x0074 (int)
ALL-NEXT: options: has unique name | is nested
-ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534]
-ALL-NEXT: class name: `__vc_attributes::helper_attributes::v1_alttypeAttribute`
+ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] `__vc_attributes::helper_attributes::v1_alttypeAttribute`
ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
ALL-NEXT: options: forward ref | has unique name
@@ -430,8 +411,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1044]
ALL-NEXT: - LF_ONEMETHOD [name = `v1_alttypeAttribute`]
ALL-NEXT: type = 0x1048, vftable offset = -1, attrs = public
ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1044, offset = 0, attrs = public]
-ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215]
-ALL-NEXT: class name: `__vc_attributes::helper_attributes::v1_alttypeAttribute`
+ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] `__vc_attributes::helper_attributes::v1_alttypeAttribute`
ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@`
ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1049
ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name
@@ -590,147 +570,195 @@ BIG-NEXT: ============================================================
BIG-NEXT: Mod 0000 | Name: `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`:
BIG-NEXT: Obj: `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`:
BIG-NEXT: debug stream: 12, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0001 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_cpu_disp_.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 14, # files: 14, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0002 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_initsect_.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 15, # files: 19, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0003 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_sehprolg4_.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 16, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 1 `f:\dd\vctools\crt\vcstartup\src\eh\i386\sehprolg4.asm`
BIG-NEXT: Mod 0004 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_chandler4gs_.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 17, # files: 14, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0005 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_secchk_.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 18, # files: 14, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0006 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_cookie.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 19, # files: 9, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0007 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_report.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 20, # files: 14, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0008 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_support.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 21, # files: 10, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0009 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\checkcfg.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 22, # files: 14, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0010 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\guard_support.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 23, # files: 10, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0011 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\loadcfg.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 24, # files: 9, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0012 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_dtor.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 25, # files: 11, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0013 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_init.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 26, # files: 10, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0014 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr_detection.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 27, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0015 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_detection.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 28, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0016 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\argv_mode.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 29, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0017 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\commit_mode.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 30, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0018 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_local_stdio_options.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 31, # files: 24, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0019 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\denormal_control.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 32, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0020 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\env_mode.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 33, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0021 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\file_mode.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 34, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0022 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\invalid_parameter_handler.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 35, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0023 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 36, # files: 2, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0024 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\new_mode.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 37, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0025 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\thread_locale.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 38, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0026 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\tncleanup.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 39, # files: 21, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0027 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\exe_main.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 40, # files: 26, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0028 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\initializers.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 41, # files: 20, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0029 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 42, # files: 20, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0030 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_stubs.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 43, # files: 1, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0031 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility_desktop.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 44, # files: 20, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0032 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_precision.obj`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`:
BIG-NEXT: debug stream: 45, # files: 20, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0033 | Name: `Import:KERNEL32.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\um\x86\kernel32.lib`:
BIG-NEXT: debug stream: 47, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0034 | Name: `KERNEL32.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\um\x86\kernel32.lib`:
BIG-NEXT: debug stream: 46, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0035 | Name: `Import:VCRUNTIME140.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\vcruntime.lib`:
BIG-NEXT: debug stream: 49, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0036 | Name: `VCRUNTIME140.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\vcruntime.lib`:
BIG-NEXT: debug stream: 48, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0037 | Name: `Import:api-ms-win-crt-stdio-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 59, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0038 | Name: `api-ms-win-crt-stdio-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 58, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0039 | Name: `Import:api-ms-win-crt-runtime-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 57, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0040 | Name: `api-ms-win-crt-runtime-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 56, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0041 | Name: `Import:api-ms-win-crt-math-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 55, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0042 | Name: `api-ms-win-crt-math-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 54, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0043 | Name: `Import:api-ms-win-crt-locale-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 53, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0044 | Name: `api-ms-win-crt-locale-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 52, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0045 | Name: `Import:api-ms-win-crt-heap-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 51, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0046 | Name: `api-ms-win-crt-heap-l1-1-0.dll`:
BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`:
BIG-NEXT: debug stream: 50, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 ``
BIG-NEXT: Mod 0047 | Name: `* Linker *`:
BIG-NEXT: Obj: ``:
BIG-NEXT: debug stream: 60, # files: 0, has ec info: false
+BIG-NEXT: pdb file ni: 55 `{{.*test.pdb}}`, src file ni: 0 ``
BIG: Files
BIG-NEXT: ============================================================
BIG-NEXT: Mod 0000 | `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`:
diff --git a/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test
index 3903c07b027fb..dd4c072fe0c94 100644
--- a/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test
+++ b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test
@@ -14,8 +14,7 @@ TPI-TYPES-NEXT: - LF_MEMBER [name = `FooMember`, Type = 0x0403 (void*
TPI-TYPES-NEXT: 0x1002 | LF_ARGLIST [size = 16]
TPI-TYPES-NEXT: 0x0074 (int): `int`
TPI-TYPES-NEXT: 0x1000: `char**`
-TPI-TYPES-NEXT: 0x1003 | LF_STRUCTURE [size = 36]
-TPI-TYPES-NEXT: class name: `FooBar`
+TPI-TYPES-NEXT: 0x1003 | LF_STRUCTURE [size = 36] `FooBar`
TPI-TYPES-NEXT: unique name: `FooBar`
TPI-TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1001
TPI-TYPES-NEXT: options: has unique name
diff --git a/test/DebugInfo/PDB/pdbdump-mergetypes.test b/test/DebugInfo/PDB/pdbdump-mergetypes.test
index 8ab64cfab5163..60cf4a172aa2a 100644
--- a/test/DebugInfo/PDB/pdbdump-mergetypes.test
+++ b/test/DebugInfo/PDB/pdbdump-mergetypes.test
@@ -11,8 +11,7 @@ MERGED-NEXT: 0x1000 | LF_POINTER [size = 12]
MERGED-NEXT: referent = 0x0075 (unsigned), mode = pointer, opts = None, kind = ptr32
MERGED-NEXT: 0x1001 | LF_POINTER [size = 12]
MERGED-NEXT: referent = 0x0076 (__int64), mode = pointer, opts = None, kind = ptr32
-MERGED-NEXT: 0x1002 | LF_STRUCTURE [size = 48]
-MERGED-NEXT: class name: `OnlyInMerge1`
+MERGED-NEXT: 0x1002 | LF_STRUCTURE [size = 48] `OnlyInMerge1`
MERGED-NEXT: unique name: `OnlyInMerge1`
MERGED-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
MERGED-NEXT: options: forward ref | has unique name
@@ -29,8 +28,7 @@ MERGED-NEXT: 0x1003: `unsigned**`
MERGED-NEXT: 0x1007 | LF_PROCEDURE [size = 16]
MERGED-NEXT: return type = 0x0075 (unsigned), # args = 0, param list = 0x1006
MERGED-NEXT: calling conv = cdecl, options = None
-MERGED-NEXT: 0x1008 | LF_STRUCTURE [size = 48]
-MERGED-NEXT: class name: `OnlyInMerge2`
+MERGED-NEXT: 0x1008 | LF_STRUCTURE [size = 48] `OnlyInMerge2`
MERGED-NEXT: unique name: `OnlyInMerge2`
MERGED-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
MERGED-NEXT: options: forward ref | has unique name
diff --git a/test/DebugInfo/X86/dbg-declare-inalloca.ll b/test/DebugInfo/X86/dbg-declare-inalloca.ll
index e3f5c7e629b87..e8a310856c104 100644
--- a/test/DebugInfo/X86/dbg-declare-inalloca.ll
+++ b/test/DebugInfo/X86/dbg-declare-inalloca.ll
@@ -55,41 +55,41 @@
; CHECK: .asciz "c"
; CHECK: .cv_def_range [[start]] [[end]]
-; OBJ-LABEL: ProcStart {
+; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
; OBJ: Kind: S_GPROC32_ID (0x1147)
; OBJ: DisplayName: f
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: NonTrivial (0x1007)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: a
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 21
; OBJ: BasePointerOffset: 12
; OBJ: }
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: b
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 21
; OBJ: BasePointerOffset: 16
; OBJ: }
; FIXME: Retain unused.
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: c
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 21
; OBJ: BasePointerOffset: 24
; OBJ: }
diff --git a/test/DebugInfo/dwarfdump-str-offsets.test b/test/DebugInfo/dwarfdump-str-offsets.test
index 0465357ba32a6..c09135580fe62 100644
--- a/test/DebugInfo/dwarfdump-str-offsets.test
+++ b/test/DebugInfo/dwarfdump-str-offsets.test
@@ -1,92 +1,94 @@
-RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets.x86_64.o | FileCheck %s
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets.x86_64.o | FileCheck --check-prefix=COMMON \
+RUN: --check-prefix=SPLIT %s
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets-macho.o | FileCheck --check-prefix=COMMON %s
; We are using a hand-constructed object file and are interested in the correct
; display of the DW_AT_str_offsets_base attribute, the correct display of strings,
; and the dump of the .debug_str_offsets[.dwo] table.
;
; Abbreviation for DW_AT_str_offsets_base
-CHECK: .debug_abbrev contents:
-CHECK-NOT: contents:
-CHECK: DW_TAG_compile_unit
-CHECK-NOT: DW_TAG
-CHECK: DW_AT_str_offsets_base DW_FORM_sec_offset
+COMMON: .debug_abbrev contents:
+COMMON-NOT: contents:
+COMMON: DW_TAG_compile_unit
+COMMON-NOT: DW_TAG
+COMMON: DW_AT_str_offsets_base DW_FORM_sec_offset
; Verify that strings are displayed correctly as indexed strings
-CHECK: .debug_info contents:
-CHECK-NOT: contents:
-CHECK: DW_TAG_compile_unit
-CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
-CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
-CHECK-NOT: NULL
-CHECK: DW_TAG_subprogram
-CHECK-NEXT: DW_AT_name [DW_FORM_strx1] ( indexed (00000003) string = "MyFunc")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx2] ( indexed (00000004) string = "MyVar1")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx3] ( indexed (00000005) string = "MyVar2")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx4] ( indexed (00000006) string = "MyVar3")
+COMMON: .debug_info contents:
+COMMON-NOT: contents:
+COMMON: DW_TAG_compile_unit
+COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
+COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
+COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
+COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
+COMMON-NOT: NULL
+COMMON: DW_TAG_subprogram
+COMMON-NEXT: DW_AT_name [DW_FORM_strx1] ( indexed (00000003) string = "MyFunc")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx2] ( indexed (00000004) string = "MyVar1")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx3] ( indexed (00000005) string = "MyVar2")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx4] ( indexed (00000006) string = "MyVar3")
; Second compile unit (b.cpp)
-CHECK: DW_TAG_compile_unit
-CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
-CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
+COMMON: DW_TAG_compile_unit
+COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
+COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
+COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
+COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
; The split CU
-CHECK: .debug_info.dwo contents:
-CHECK-NOT: contents:
-CHECK: DW_TAG_compile_unit
-CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer")
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
-CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU")
+SPLIT: .debug_info.dwo contents:
+SPLIT-NOT: contents:
+SPLIT: DW_TAG_compile_unit
+SPLIT-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer")
+SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit")
+SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
+SPLIT-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU")
; The type unit
-CHECK: .debug_types contents:
-CHECK: DW_TAG_type_unit
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
-CHECK: DW_TAG_structure_type
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
+COMMON: .debug_types contents:
+COMMON: DW_TAG_type_unit
+COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
+COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
+COMMON: DW_TAG_structure_type
+COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
; The split type unit
-CHECK: .debug_types.dwo contents:
-CHECK: DW_TAG_type_unit
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
-CHECK: DW_TAG_structure_type
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct")
+SPLIT: .debug_types.dwo contents:
+SPLIT: DW_TAG_type_unit
+SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit")
+SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+SPLIT: DW_TAG_structure_type
+SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct")
; The .debug_str_offsets section
-CHECK: .debug_str_offsets contents:
-CHECK-NEXT: 0x00000000: Contribution size = 28, Version = 5
-CHECK-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
-CHECK-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
-CHECK-NEXT: 0x00000010: 00000027 "/home/test/CU1"
-CHECK-NEXT: 0x00000014: 00000067 "MyFunc"
-CHECK-NEXT: 0x00000018: 0000006e "MyVar1"
-CHECK-NEXT: 0x0000001c: 00000075 "MyVar2"
-CHECK-NEXT: 0x00000020: 0000007c "MyVar3"
-CHECK-NEXT: 0x00000024: Contribution size = 12, Version = 5
-CHECK-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
-CHECK-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
-CHECK-NEXT: 0x00000034: 00000045 "/home/test/CU2"
-CHECK-NEXT: 0x00000038: Contribution size = 8, Version = 5
-CHECK-NEXT: 0x00000040: 00000054 "Type_Unit"
-CHECK-NEXT: 0x00000044: 0000005e "MyStruct"
+COMMON: .debug_str_offsets contents:
+COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5
+COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
+COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
+COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1"
+COMMON-NEXT: 0x00000014: 00000067 "MyFunc"
+COMMON-NEXT: 0x00000018: 0000006e "MyVar1"
+COMMON-NEXT: 0x0000001c: 00000075 "MyVar2"
+COMMON-NEXT: 0x00000020: 0000007c "MyVar3"
+COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5
+COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
+COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
+COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2"
+COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5
+COMMON-NEXT: 0x00000040: 00000054 "Type_Unit"
+COMMON-NEXT: 0x00000044: 0000005e "MyStruct"
-CHECK: .debug_str_offsets.dwo contents:
-CHECK-NEXT: 0x00000000: Contribution size = 12, Version = 5
-CHECK-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer"
-CHECK-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit"
-CHECK-NEXT: 0x00000010: 00000034 "/home/test/splitCU"
-CHECK-NEXT: 0x00000014: Contribution size = 8, Version = 5
-CHECK-NEXT: 0x0000001c: 00000047 "V5_split_type_unit"
-CHECK-NEXT: 0x00000020: 0000005a "V5_split_Mystruct"
+SPLIT: .debug_str_offsets.dwo contents:
+SPLIT-NEXT: 0x00000000: Contribution size = 12, Version = 5
+SPLIT-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer"
+SPLIT-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit"
+SPLIT-NEXT: 0x00000010: 00000034 "/home/test/splitCU"
+SPLIT-NEXT: 0x00000014: Contribution size = 8, Version = 5
+SPLIT-NEXT: 0x0000001c: 00000047 "V5_split_type_unit"
+SPLIT-NEXT: 0x00000020: 0000005a "V5_split_Mystruct"
diff --git a/test/DebugInfo/invalid-relocations.test b/test/DebugInfo/invalid-relocations.test
new file mode 100644
index 0000000000000..2252e1a205c3d
--- /dev/null
+++ b/test/DebugInfo/invalid-relocations.test
@@ -0,0 +1,35 @@
+# RUN: yaml2obj %s > %t.o
+# RUN: llvm-dwarfdump %t.o 2>&1 | FileCheck %s
+# CHECK: failed to compute relocation: Unknown
+
+!ELF
+FileHeader:
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_386
+Sections:
+ - Type: SHT_PROGBITS
+ Name: .text
+ Flags: [ ]
+ AddressAlign: 0x04
+ Content: "0000"
+ - Type: SHT_PROGBITS
+ Name: .debug_info
+ Flags: [ ]
+ AddressAlign: 0x04
+ Content: "0000"
+ - Type: SHT_REL
+ Name: .rel.debug_info
+ Link: .symtab
+ Info: .debug_info
+ Relocations:
+ - Offset: 0
+ Symbol: _start
+ Type: 0xFF
+Symbols:
+ Global:
+ - Name: _start
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x0
diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index 2c64804659fce..bcad37cf9a489 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test
@@ -10,9 +10,10 @@ RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x8dc" >> %t.input
RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0xa05" >> %t.input
RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x987" >> %t.input
RUN: echo "%p/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64 0x568" >> %t.input
-RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x640" >> %t.input
-RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input
-RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x62d" >> %t.input
+RUN: cp "%p/Inputs/dwarfdump-test3.elf-x86-64-space" "%T/dwarfdump-test3.elf-x86-64 space"
+RUN: echo "\"%T/dwarfdump-test3.elf-x86-64 space\" 0x640" >> %t.input
+RUN: echo "\"%T/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input
+RUN: echo "\"%T/dwarfdump-test3.elf-x86-64 space\" 0x62d" >> %t.input
RUN: echo "%p/Inputs/macho-universal 0x1f84" >> %t.input
RUN: echo "%p/Inputs/macho-universal:i386 0x1f67" >> %t.input
RUN: echo "%p/Inputs/macho-universal:x86_64 0x100000f05" >> %t.input
diff --git a/test/Instrumentation/MemorySanitizer/unsized_type.ll b/test/Instrumentation/MemorySanitizer/unsized_type.ll
new file mode 100644
index 0000000000000..94ae92d3354a4
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/unsized_type.ll
@@ -0,0 +1,22 @@
+; Check that unsized token types used by coroutine intrinsics do not cause
+; assertion failures.
+; RUN: opt < %s -msan -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*)
+declare i1 @llvm.coro.alloc(token)
+
+define void @foo() sanitize_memory {
+entry:
+ %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+ %dyn.alloc.reqd = call i1 @llvm.coro.alloc(token %id)
+ ret void
+}
+
+; CHECK: define void @foo
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %id = call token @llvm.coro.id
+; CHECK-NEXT: call i1 @llvm.coro.alloc(token %id)
+; CHECK-NEXT: ret void
diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll
index 30c58fea4cb7e..3d83d9299e664 100644
--- a/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -1959,7 +1959,7 @@ entry:
define void @atomic_signal_fence_acquire() nounwind uwtable {
entry:
- fence singlethread acquire, !dbg !7
+ fence syncscope("singlethread") acquire, !dbg !7
ret void, !dbg !7
}
; CHECK-LABEL: atomic_signal_fence_acquire
@@ -1975,7 +1975,7 @@ entry:
define void @atomic_signal_fence_release() nounwind uwtable {
entry:
- fence singlethread release, !dbg !7
+ fence syncscope("singlethread") release, !dbg !7
ret void, !dbg !7
}
; CHECK-LABEL: atomic_signal_fence_release
@@ -1991,7 +1991,7 @@ entry:
define void @atomic_signal_fence_acq_rel() nounwind uwtable {
entry:
- fence singlethread acq_rel, !dbg !7
+ fence syncscope("singlethread") acq_rel, !dbg !7
ret void, !dbg !7
}
; CHECK-LABEL: atomic_signal_fence_acq_rel
@@ -2007,7 +2007,7 @@ entry:
define void @atomic_signal_fence_seq_cst() nounwind uwtable {
entry:
- fence singlethread seq_cst, !dbg !7
+ fence syncscope("singlethread") seq_cst, !dbg !7
ret void, !dbg !7
}
; CHECK-LABEL: atomic_signal_fence_seq_cst
diff --git a/test/LTO/Resolution/X86/linker-redef-thin.ll b/test/LTO/Resolution/X86/linker-redef-thin.ll
new file mode 100644
index 0000000000000..ebaac8094e75a
--- /dev/null
+++ b/test/LTO/Resolution/X86/linker-redef-thin.ll
@@ -0,0 +1,16 @@
+; RUN: opt -module-summary %s -o %t.o
+; RUN: llvm-lto2 run -o %t1.o %t.o -r %t.o,patatino,pr
+; RUN: llvm-readobj -t %t1.o.0 | FileCheck %s
+
+; CHECK: Name: patatino
+; CHECK-NEXT: Value:
+; CHECK-NEXT: Size:
+; CHECK-NEXT: Binding: Weak
+; CHECK-NEXT: Type: Function
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @patatino() {
+ ret void
+}
diff --git a/test/Linker/Inputs/syncscope-1.ll b/test/Linker/Inputs/syncscope-1.ll
new file mode 100644
index 0000000000000..90578e931dd53
--- /dev/null
+++ b/test/Linker/Inputs/syncscope-1.ll
@@ -0,0 +1,6 @@
+define void @syncscope_1() {
+ fence syncscope("agent") seq_cst
+ fence syncscope("workgroup") seq_cst
+ fence syncscope("wavefront") seq_cst
+ ret void
+}
diff --git a/test/Linker/Inputs/syncscope-2.ll b/test/Linker/Inputs/syncscope-2.ll
new file mode 100644
index 0000000000000..527c5bf93d005
--- /dev/null
+++ b/test/Linker/Inputs/syncscope-2.ll
@@ -0,0 +1,6 @@
+define void @syncscope_2() {
+ fence syncscope("image") seq_cst
+ fence syncscope("agent") seq_cst
+ fence syncscope("workgroup") seq_cst
+ ret void
+}
diff --git a/test/Linker/Inputs/thumb-module-inline-asm.ll b/test/Linker/Inputs/thumb-module-inline-asm.ll
new file mode 100644
index 0000000000000..7792ff96d5b57
--- /dev/null
+++ b/test/Linker/Inputs/thumb-module-inline-asm.ll
@@ -0,0 +1,3 @@
+target triple = "thumbv7-linux-gnueabihf"
+
+module asm "orn r1, r2, r2"
diff --git a/test/Linker/link-arm-and-thumb-module-inline-asm.ll b/test/Linker/link-arm-and-thumb-module-inline-asm.ll
new file mode 100644
index 0000000000000..13779f37ffa0e
--- /dev/null
+++ b/test/Linker/link-arm-and-thumb-module-inline-asm.ll
@@ -0,0 +1,20 @@
+; This test checks that proper directives to switch between ARM and Thumb mode
+; are added when linking ARM and Thumb modules.
+
+; RUN: llvm-as %s -o %t1.bc
+; RUN: llvm-as %p/Inputs/thumb-module-inline-asm.ll -o %t2.bc
+; RUN: llvm-link %t1.bc %t2.bc -S 2> %t3.out | FileCheck %s
+
+target triple = "armv7-linux-gnueabihf"
+
+module asm "add r1, r2, r2"
+
+; CHECK: .text
+; CHECK-NEXT: .balign 4
+; CHECK-NEXT: .arm
+; CHECK-NEXT: add r1, r2, r2
+; CHECK-NEXT: module asm
+; CHECK-NEXT: .text
+; CHECK-NEXT: .balign 2
+; CHECK-NEXT: .thumb
+; CHECK-NEXT: orn r1, r2, r2
diff --git a/test/Linker/syncscopes.ll b/test/Linker/syncscopes.ll
new file mode 100644
index 0000000000000..a572c23cffbdb
--- /dev/null
+++ b/test/Linker/syncscopes.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-link %S/Inputs/syncscope-1.ll %S/Inputs/syncscope-2.ll -S | FileCheck %s
+
+; CHECK-LABEL: define void @syncscope_1
+; CHECK: fence syncscope("agent") seq_cst
+; CHECK: fence syncscope("workgroup") seq_cst
+; CHECK: fence syncscope("wavefront") seq_cst
+
+; CHECK-LABEL: define void @syncscope_2
+; CHECK: fence syncscope("image") seq_cst
+; CHECK: fence syncscope("agent") seq_cst
+; CHECK: fence syncscope("workgroup") seq_cst
diff --git a/test/MC/AArch64/label-arithmetic-diags-elf.s b/test/MC/AArch64/label-arithmetic-diags-elf.s
index dbfdd24f8dc91..2ef67fafb2ea5 100644
--- a/test/MC/AArch64/label-arithmetic-diags-elf.s
+++ b/test/MC/AArch64/label-arithmetic-diags-elf.s
@@ -5,7 +5,7 @@ b:
.fill 300
e:
.byte e - b
- // CHECK: error: value evaluated as 300 is out of range.
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: value evaluated as 300 is out of range.
// CHECK-NEXT: .byte e - b
// CHECK-NEXT: ^
@@ -14,67 +14,74 @@ start:
.space 5000
end:
add w0, w1, #(end - start)
- cmp w0, #(end - start)
- // CHECK: error: fixup value out of range
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range
// CHECK-NEXT: add w0, w1, #(end - start)
// CHECK-NEXT: ^
- // CHECK: error: fixup value out of range
+
+ cmp w0, #(end - start)
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range
// CHECK-NEXT: cmp w0, #(end - start)
// CHECK-NEXT: ^
negative:
add w0, w1, #(end - negative)
- cmp w0, #(end - negative)
- // CHECK: error: fixup value out of range
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range
// CHECK-NEXT: add w0, w1, #(end - negative)
// CHECK-NEXT: ^
- // CHECK: error: fixup value out of range
+
+ cmp w0, #(end - negative)
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range
// CHECK-NEXT: cmp w0, #(end - negative)
// CHECK-NEXT: ^
add w0, w1, #(end - external)
- cmp w0, #(end - external)
- // CHECK: error: symbol 'external' can not be undefined in a subtraction expression
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: symbol 'external' can not be undefined in a subtraction expression
// CHECK-NEXT: add w0, w1, #(end - external)
// CHECK-NEXT: ^
- // CHECK: error: symbol 'external' can not be undefined in a subtraction expression
+
+ cmp w0, #(end - external)
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: symbol 'external' can not be undefined in a subtraction expression
// CHECK-NEXT: cmp w0, #(end - external)
// CHECK-NEXT: ^
add w0, w1, #:lo12:external - end
- cmp w0, #:lo12:external - end
- // CHECK: error: Unsupported pc-relative fixup kind
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind
// CHECK-NEXT: add w0, w1, #:lo12:external - end
// CHECK-NEXT: ^
- // CHECK: error: Unsupported pc-relative fixup kind
+
+ cmp w0, #:lo12:external - end
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind
// CHECK-NEXT: cmp w0, #:lo12:external - end
// CHECK-NEXT: ^
add w0, w1, #:got_lo12:external - end
- cmp w0, #:got_lo12:external - end
- // CHECK: error: Unsupported pc-relative fixup kind
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind
// CHECK-NEXT: add w0, w1, #:got_lo12:external - end
// CHECK-NEXT: ^
- // CHECK: error: Unsupported pc-relative fixup kind
+
+ cmp w0, #:got_lo12:external - end
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind
// CHECK-NEXT: cmp w0, #:got_lo12:external - end
// CHECK-NEXT: ^
.section sec_y
end_across_sec:
add w0, w1, #(end_across_sec - start)
- cmp w0, #(end_across_sec - start)
- // CHECK: error: Cannot represent a difference across sections
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
// CHECK-NEXT: add w0, w1, #(end_across_sec - start)
// CHECK-NEXT: ^
- // CHECK: error: Cannot represent a difference across sections
+
+ cmp w0, #(end_across_sec - start)
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
// CHECK-NEXT: cmp w0, #(end_across_sec - start)
// CHECK-NEXT: ^
add w0, w1, #(sec_y - sec_x)
- cmp w0, #(sec_y - sec_x)
- // CHECK: error: Cannot represent a difference across sections
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
// CHECK-NEXT: add w0, w1, #(sec_y - sec_x)
// CHECK-NEXT: ^
- // CHECK: error: Cannot represent a difference across sections
+
+ cmp w0, #(sec_y - sec_x)
+ // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
// CHECK-NEXT: cmp w0, #(sec_y - sec_x)
// CHECK-NEXT: ^
diff --git a/test/MC/AMDGPU/gfx9_asm_all.s b/test/MC/AMDGPU/gfx9_asm_all.s
index 0c3dbd221a49e..56484a37bdcea 100644
--- a/test/MC/AMDGPU/gfx9_asm_all.s
+++ b/test/MC/AMDGPU/gfx9_asm_all.s
@@ -104933,3 +104933,462 @@ v_cmpx_t_u32_sdwa s[6:7], v1, v2 src0_sel:DWORD src1_sel:WORD_1
v_cmpx_t_u32_sdwa s[6:7], v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x86,0x06,0x0e]
+
+v_mad_mix_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v255, v1, v2, v3
+// CHECK: [0xff,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0xff,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x65,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x66,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x67,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x6a,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x6b,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x7c,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x7e,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x7f,0x04,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v255, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xff,0x0f,0x1c]
+
+v_mad_mix_f32 v5, v1, s2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, s101, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcb,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, flat_scratch_lo, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcd,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, flat_scratch_hi, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcf,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, vcc_lo, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xd5,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, vcc_hi, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xd7,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, m0, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xf9,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, exec_lo, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xfd,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, exec_hi, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xff,0x0c,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v255
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfe,0x1f]
+
+v_mad_mix_f32 v5, v1, v2, s3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x18]
+
+v_mad_mix_f32 v5, v1, v2, s101
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x96,0x19]
+
+v_mad_mix_f32 v5, v1, v2, flat_scratch_lo
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x9a,0x19]
+
+v_mad_mix_f32 v5, v1, v2, flat_scratch_hi
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x9e,0x19]
+
+v_mad_mix_f32 v5, v1, v2, vcc_lo
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xaa,0x19]
+
+v_mad_mix_f32 v5, v1, v2, vcc_hi
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xae,0x19]
+
+v_mad_mix_f32 v5, v1, v2, m0
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xf2,0x19]
+
+v_mad_mix_f32 v5, v1, v2, exec_lo
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfa,0x19]
+
+v_mad_mix_f32 v5, v1, v2, exec_hi
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfe,0x19]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,0,0]
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel:[1,0,0]
+// CHECK: [0x05,0x48,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,1,0]
+// CHECK: [0x05,0x50,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,0,1]
+// CHECK: [0x05,0x60,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: [0x05,0x78,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[1,1,1]
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,0,0]
+// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[1,0,0]
+// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,1,0]
+// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14]
+
+v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,0,1]
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v5, -v1, v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x3c]
+
+v_mad_mix_f32 v5, v1, -v2, v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x5c]
+
+v_mad_mix_f32 v5, v1, v2, -v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x9c]
+
+v_mad_mix_f32 v5, -v1, -v2, -v3
+// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0xfc]
+
+v_mad_mix_f32 v5, |v1|, v2, v3
+// CHECK: [0x05,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, |v2|, v3
+// CHECK: [0x05,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, |v3|
+// CHECK: [0x05,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, |v1|, |v2|, |v3|
+// CHECK: [0x05,0x47,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v5, v1, v2, v3 clamp
+// CHECK: [0x05,0xc0,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v255, v1, v2, v3
+// CHECK: [0xff,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v255, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0xff,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, s1, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, s101, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x65,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x66,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x67,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x6a,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x6b,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, m0, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x7c,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x7e,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x7f,0x04,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v255, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xff,0x0f,0x1c]
+
+v_mad_mixhi_f16 v5, v1, s2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, s101, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcb,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, flat_scratch_lo, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcd,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, flat_scratch_hi, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcf,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, vcc_lo, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xd5,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, vcc_hi, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xd7,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, m0, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xf9,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, exec_lo, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xfd,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, exec_hi, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xff,0x0c,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v255
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfe,0x1f]
+
+v_mad_mixhi_f16 v5, v1, v2, s3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x18]
+
+v_mad_mixhi_f16 v5, v1, v2, s101
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x96,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, flat_scratch_lo
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x9a,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, flat_scratch_hi
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x9e,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, vcc_lo
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xaa,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, vcc_hi
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xae,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, m0
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xf2,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, exec_lo
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfa,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, exec_hi
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfe,0x19]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,0,0]
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[1,0,0]
+// CHECK: [0x05,0x48,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,1,0]
+// CHECK: [0x05,0x50,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,0,1]
+// CHECK: [0x05,0x60,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: [0x05,0x78,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[1,1,1]
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,0,0]
+// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[1,0,0]
+// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x0c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,1,0]
+// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x14]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,0,1]
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixhi_f16 v5, -v1, v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x3c]
+
+v_mad_mixhi_f16 v5, v1, -v2, v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x5c]
+
+v_mad_mixhi_f16 v5, v1, v2, -v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x9c]
+
+v_mad_mixhi_f16 v5, -v1, -v2, -v3
+// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0xfc]
+
+v_mad_mixhi_f16 v5, |v1|, v2, v3
+// CHECK: [0x05,0x41,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, |v2|, v3
+// CHECK: [0x05,0x42,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, |v3|
+// CHECK: [0x05,0x44,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, |v1|, |v2|, |v3|
+// CHECK: [0x05,0x47,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixhi_f16 v5, v1, v2, v3 clamp
+// CHECK: [0x05,0xc0,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v255, v1, v2, v3
+// CHECK: [0xff,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v255, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0xff,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, s1, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, s101, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x65,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x66,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x67,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x6a,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x6b,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, m0, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x7c,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x7e,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x7f,0x04,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v255, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xff,0x0f,0x1c]
+
+v_mad_mixlo_f16 v5, v1, s2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, s101, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcb,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, flat_scratch_lo, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcd,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, flat_scratch_hi, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcf,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, vcc_lo, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xd5,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, vcc_hi, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xd7,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, m0, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xf9,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, exec_lo, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xfd,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, exec_hi, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xff,0x0c,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v255
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfe,0x1f]
+
+v_mad_mixlo_f16 v5, v1, v2, s3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x18]
+
+v_mad_mixlo_f16 v5, v1, v2, s101
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x96,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, flat_scratch_lo
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x9a,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, flat_scratch_hi
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x9e,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, vcc_lo
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xaa,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, vcc_hi
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xae,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, m0
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xf2,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, exec_lo
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfa,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, exec_hi
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfe,0x19]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,0,0]
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[1,0,0]
+// CHECK: [0x05,0x48,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,1,0]
+// CHECK: [0x05,0x50,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,0,1]
+// CHECK: [0x05,0x60,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: [0x05,0x78,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[1,1,1]
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,0,0]
+// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[1,0,0]
+// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x0c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,1,0]
+// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x14]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,0,1]
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixlo_f16 v5, -v1, v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x3c]
+
+v_mad_mixlo_f16 v5, v1, -v2, v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x5c]
+
+v_mad_mixlo_f16 v5, v1, v2, -v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x9c]
+
+v_mad_mixlo_f16 v5, -v1, -v2, -v3
+// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0xfc]
+
+v_mad_mixlo_f16 v5, |v1|, v2, v3
+// CHECK: [0x05,0x41,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, |v2|, v3
+// CHECK: [0x05,0x42,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, |v3|
+// CHECK: [0x05,0x44,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, |v1|, |v2|, |v3|
+// CHECK: [0x05,0x47,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mixlo_f16 v5, v1, v2, v3 clamp
+// CHECK: [0x05,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s
index f4b1a3da714f8..bc6f6100f327d 100644
--- a/test/MC/AMDGPU/vop3p-err.s
+++ b/test/MC/AMDGPU/vop3p-err.s
@@ -71,47 +71,6 @@ v_pk_add_u16 v1, abs(v2), v3
// GFX9: :19: error: invalid operand for instruction
v_pk_add_u16 v1, -v2, v3
-
-//
-// Packed operands on the non-packed VOP3P instructions
-//
-
-// GFX9: invalid operand for instruction
-v_mad_mix_f32 v1, v2, v3, v4 op_sel:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mix_f32 v1, v2, v3, v4 op_sel_hi:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mix_f32 v1, v2, v3, v4 neg_lo:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mix_f32 v1, v2, v3, v4 neg_hi:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixlo_f16 v1, v2, v3, v4 op_sel:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixlo_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixlo_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixlo_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixhi_f16 v1, v2, v3, v4 op_sel:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixhi_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixhi_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
-
-// GFX9: invalid operand for instruction
-v_mad_mixhi_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
-
//
// Constant bus restrictions
//
diff --git a/test/MC/AMDGPU/vop3p.s b/test/MC/AMDGPU/vop3p.s
index c9eda69e13d2a..97c3650cdf54f 100644
--- a/test/MC/AMDGPU/vop3p.s
+++ b/test/MC/AMDGPU/vop3p.s
@@ -169,48 +169,81 @@ v_pk_max_f16 v0, v1, v2
// GFX9: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x92,0xd3,0x01,0x05,0x02,0x18]
v_mad_mix_f32 v0, v1, v2, v3
-// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
v_mad_mixlo_f16 v0, v1, v2, v3
-// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04]
+// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c]
v_mad_mixhi_f16 v0, v1, v2, v3
-// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04]
-
+// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c]
//
// Regular source modifiers on non-packed instructions
//
v_mad_mix_f32 v0, abs(v1), v2, v3
-// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
v_mad_mix_f32 v0, v1, abs(v2), v3
-// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
v_mad_mix_f32 v0, v1, v2, abs(v3)
-// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
v_mad_mix_f32 v0, -v1, v2, v3
-// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x3c]
v_mad_mix_f32 v0, v1, -v2, v3
-// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x5c]
v_mad_mix_f32 v0, v1, v2, -v3
-// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x9c]
v_mad_mix_f32 v0, -abs(v1), v2, v3
-// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x3c]
v_mad_mix_f32 v0, v1, -abs(v2), v3
-// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x5c]
v_mad_mix_f32 v0, v1, v2, -abs(v3)
-// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x9c]
v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3)
-// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44]
+// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x45,0xa1,0xd3,0x01,0x05,0x0e,0x5c]
v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3)
-// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4]
+// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x46,0xa2,0xd3,0x01,0x05,0x0e,0xbc]
+
+//
+// op_sel with non-packed instructions
+//
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x00,0x48,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x00,0x50,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x60,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x00,0x78,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1]
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
diff --git a/test/MC/ARM/elf-movt.s b/test/MC/ARM/elf-movt.s
index 9df7a603b71a7..858e4aa41b29c 100644
--- a/test/MC/ARM/elf-movt.s
+++ b/test/MC/ARM/elf-movt.s
@@ -14,8 +14,20 @@ barf: @ @barf
movw r0, :lower16:GOT-(.LPC0_2+8)
movt r0, :upper16:GOT-(.LPC0_2+8)
.LPC0_2:
+ movw r0, :lower16:extern_symbol+1234
+ movt r0, :upper16:extern_symbol+1234
+
+ movw r0, :lower16:(foo - bar + 1234)
+ movt r0, :upper16:(foo - bar + 1234)
+foo:
+bar:
+
@ ASM: movw r0, :lower16:(GOT-(.LPC0_2+8))
@ ASM-NEXT: movt r0, :upper16:(GOT-(.LPC0_2+8))
+@ ASM: movw r0, :lower16:(extern_symbol+1234)
+@ ASM-NEXT: movt r0, :upper16:(extern_symbol+1234)
+@ ASM: movw r0, :lower16:((foo-bar)+1234)
+@ ASM-NEXT: movt r0, :upper16:((foo-bar)+1234)
@OBJ: Disassembly of section .text:
@OBJ-NEXT: barf:
@@ -23,6 +35,12 @@ barf: @ @barf
@OBJ-NEXT: 00000000: R_ARM_MOVW_PREL_NC GOT
@OBJ-NEXT: 4: f4 0f 4f e3 movt r0, #65524
@OBJ-NEXT: 00000004: R_ARM_MOVT_PREL GOT
+@OBJ-NEXT: 8: d2 04 00 e3 movw r0, #1234
+@OBJ-NEXT: 00000008: R_ARM_MOVW_ABS_NC extern_symbol
+@OBJ-NEXT: c: d2 04 40 e3 movt r0, #1234
+@OBJ-NEXT: 0000000c: R_ARM_MOVT_ABS extern_symbol
+@OBJ-NEXT: 10: d2 04 00 e3 movw r0, #1234
+@OBJ-NEXT: 14: 00 00 40 e3 movt r0, #0
@THUMB: Disassembly of section .text:
@THUMB-NEXT: barf:
@@ -30,3 +48,9 @@ barf: @ @barf
@THUMB-NEXT: 00000000: R_ARM_THM_MOVW_PREL_NC GOT
@THUMB-NEXT: 4: cf f6 f4 70 movt r0, #65524
@THUMB-NEXT: 00000004: R_ARM_THM_MOVT_PREL GOT
+@THUMB-NEXT: 8: 40 f2 d2 40 movw r0, #1234
+@THUMB-NEXT: 00000008: R_ARM_THM_MOVW_ABS_NC extern_symbol
+@THUMB-NEXT: c: c0 f2 d2 40 movt r0, #1234
+@THUMB-NEXT: 0000000c: R_ARM_THM_MOVT_ABS extern_symbol
+@THUMB-NEXT: 10: 40 f2 d2 40 movw r0, #1234
+@THUMB-NEXT: 14: c0 f2 00 00 movt r0, #0
diff --git a/test/MC/ARM/invalid-instructions-spellcheck.s b/test/MC/ARM/invalid-instructions-spellcheck.s
new file mode 100644
index 0000000000000..ca118cff6ddf2
--- /dev/null
+++ b/test/MC/ARM/invalid-instructions-spellcheck.s
@@ -0,0 +1,68 @@
+@ RUN: not llvm-mc -triple=arm -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple=thumb -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-THUMB
+
+@ This tests the mnemonic spell checker.
+
+@ First check what happens when the instruction mnemonic is omitted:
+
+ r1, r2, r3
+
+@ CHECK: error: unexpected token in operand
+@ CHECK-NEXT: r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ We don't want to see a suggestion here; the edit distance is too large to
+@ give sensible suggestions:
+
+ aaaaaaaaaaaaaaa r1, r2, r3
+
+@ CHECK: error: invalid instruction
+@ CHECK-NEXT: aaaaaaaaaaaaaaa r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check that we get one suggestion: 'pushh' is 1 edit away from 'push', i.e. a deletion.
+
+ pushh r1, r2, r3
+
+@CHECK: error: invalid instruction, did you mean: push?
+@CHECK-NEXT: pushh r1, r2, r3
+@CHECK-NEXT: ^
+
+ adXd r1, r2, r3
+
+@ Check edit distances 1 and 2: 'add' has an edit distance of 1 (a deletion),
+@ and 'qadd' a distance of 2 (a deletion and an insertion); see the worked
+@ example below.
+
+@ CHECK: error: invalid instruction, did you mean: add, qadd?
+@ CHECK-NEXT: adXd r1, r2, r3
+@ CHECK-NEXT: ^
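+
+@ A minimal worked example of those distances (illustrative only; the edit
+@ operations counted here are single-character deletions and insertions):
+@   adXd -> add    delete 'X'                (1 edit)
+@   adXd -> qadd   delete 'X', insert 'q'    (2 edits)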
+
+@ Check edit distance 1 and 2, just insertions:
+
+ ad r1, r2, r3
+
+@ CHECK: error: invalid instruction, did you mean: adc, add, adr, and, qadd?
+@ CHECK-NEXT: ad r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check an instruction that is 2 edits away, and also has a lot of candidates:
+
+ ldre r1, r2, r3
+
+@ CHECK: error: invalid instruction, did you mean: ldr, ldrb, ldrd, ldrex, ldrexb, ldrexd, ldrexh, ldrh, ldrt?
+@ CHECK-NEXT: ldre r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check that we don't suggest instructions that are not supported in the
+@ current mode. For example, in Thumb mode we don't want to see suggestions
+@ such as 'faddd' or 'qadd', because they are not available there.
+
+ fadd r1, r2, r3
+
+@ CHECK-THUMB: error: invalid instruction, did you mean: add?
+@ CHECK-THUMB: fadd r1, r2, r3
+@ CHECK-THUMB: ^
+
+@ CHECK: error: invalid instruction, did you mean: add, qadd?
+@ CHECK-NEXT: fadd r1, r2, r3
+@ CHECK-NEXT: ^
diff --git a/test/MC/ARM/ldr-pseudo-unpredictable.s b/test/MC/ARM/ldr-pseudo-unpredictable.s
index b275dc71ab4b6..ad5a176e0433d 100644
--- a/test/MC/ARM/ldr-pseudo-unpredictable.s
+++ b/test/MC/ARM/ldr-pseudo-unpredictable.s
@@ -1,8 +1,8 @@
@RUN: llvm-mc -triple armv5-unknown-linux-gnueabi %s | FileCheck --check-prefix=CHECK-ARM %s
-@RUN: not llvm-mc -triple thumbv7-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-SP %s
+@RUN: llvm-mc -triple thumbv7-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-T2 %s
@RUN: not llvm-mc -triple thumbv5-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-NONE %s
@RUN: llvm-mc -triple armv5-base-apple-darwin %s | FileCheck --check-prefix=CHECK-DARWIN-ARM %s
-@RUN: not llvm-mc -triple thumbv7-base-apple-darwin %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN-SP %s
+@RUN: llvm-mc -triple thumbv7-base-apple-darwin %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN-T2 %s
@RUN: not llvm-mc -triple thumbv5-base.apple.darwin %s 2>&1 | FileCheck --check-prefix=CHECK-NONE %s
@ We don't do the transformation for rt = sp or pc
@@ -10,12 +10,12 @@
ldr pc, = 0x4
@ CHECK-ARM: ldr pc, .Ltmp[[TMP0:[0-9]+]]
@ CHECK-DARWIN-ARM: ldr pc, Ltmp0
-@ CHECK-SP: error: instruction requires: arm-mode
-@ CHECK-DARWIN-SP: error: instruction requires: arm-mode
-@ CHECK-NONE: error: instruction requires: arm-mode
+@ CHECK-T2: ldr.w pc, .Ltmp[[TMP0:[0-9]+]]
+@ CHECK-DARWIN-T2: ldr.w pc, Ltmp0
+@ CHECK-NONE: error: instruction requires: thumb2
ldr sp, = 0x8
@ CHECK-ARM: ldr sp, .Ltmp[[TMP1:[0-9]+]]
@ CHECK-DARWIN-ARM: ldr sp, Ltmp1
-@ CHECK-SP: ldr.w sp, .Ltmp[[TMP0:[0-9]+]]
-@ CHECK-DARWIN-SP: ldr.w sp, Ltmp0
-@ CHECK-NONE: error: instruction requires: arm-mode
+@ CHECK-T2: ldr.w sp, .Ltmp[[TMP1:[0-9]+]]
+@ CHECK-DARWIN-T2: ldr.w sp, Ltmp1
+@ CHECK-NONE: error: instruction requires: thumb2
diff --git a/test/MC/COFF/bad-expr.s b/test/MC/COFF/bad-expr.s
index ecbdd415c3a61..cbbd5d0c946f7 100644
--- a/test/MC/COFF/bad-expr.s
+++ b/test/MC/COFF/bad-expr.s
@@ -1,7 +1,6 @@
// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s
-// CHECK: symbol '__ImageBase' can not be undefined in a subtraction expression
-
.data
_x:
+// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: symbol '__ImageBase' can not be undefined in a subtraction expression
.long _x-__ImageBase
diff --git a/test/MC/COFF/cv-def-range-gap.s b/test/MC/COFF/cv-def-range-gap.s
index 9c1531819963f..29f2def8e1bfc 100644
--- a/test/MC/COFF/cv-def-range-gap.s
+++ b/test/MC/COFF/cv-def-range-gap.s
@@ -2,12 +2,13 @@
# This tries to test defrange gap edge cases.
-# CHECK: Local {
+# CHECK: LocalSym {
# CHECK: Type: int (0x74)
# CHECK: VarName: p
# CHECK: }
-# CHECK-NOT: Local {
-# CHECK: DefRangeRegister {
+# CHECK-NOT: LocalSym {
+# CHECK: DefRangeRegisterSym {
+# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141)
# CHECK-NEXT: Register: 23
# CHECK-NEXT: MayHaveNoName: 0
# CHECK-NEXT: LocalVariableAddrRange {
@@ -20,7 +21,8 @@
# CHECK-NEXT: Range: 0x1
# CHECK-NEXT: ]
# CHECK-NEXT: }
-# CHECK-NEXT: DefRangeRegister {
+# CHECK-NEXT: DefRangeRegisterSym {
+# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141)
# CHECK-NEXT: Register: 23
# CHECK-NEXT: MayHaveNoName: 0
# CHECK-NEXT: LocalVariableAddrRange {
@@ -29,7 +31,8 @@
# CHECK-NEXT: Range: 0x6
# CHECK-NEXT: }
# CHECK-NEXT: }
-# CHECK-NEXT: DefRangeRegister {
+# CHECK-NEXT: DefRangeRegisterSym {
+# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141)
# CHECK-NEXT: Register: 23
# CHECK-NEXT: MayHaveNoName: 0
# CHECK-NEXT: LocalVariableAddrRange {
@@ -38,7 +41,8 @@
# CHECK-NEXT: Range: 0x1
# CHECK-NEXT: }
# CHECK-NEXT: }
-# CHECK-NEXT: DefRangeRegister {
+# CHECK-NEXT: DefRangeRegisterSym {
+# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141)
# CHECK-NEXT: Register: 23
# CHECK-NEXT: MayHaveNoName: 0
# CHECK-NEXT: LocalVariableAddrRange {
diff --git a/test/MC/COFF/cv-def-range.s b/test/MC/COFF/cv-def-range.s
index 5ac0df7f7d964..7a90ec263683c 100644
--- a/test/MC/COFF/cv-def-range.s
+++ b/test/MC/COFF/cv-def-range.s
@@ -77,18 +77,18 @@ Ltmp3:
.short 4431 # Record kind: S_PROC_ID_END
.cv_def_range Lvar_begin0 Lvar_end0, "\102\021\374\377\377\377"
-# CHECK: DefRangeFramePointerRel {
+# CHECK: DefRangeFramePointerRelSym {
# CHECK: Offset: -4
# CHECK: LocalVariableAddrRange {
# CHECK: OffsetStart: .text+0x9
# CHECK: ISectStart: 0x0
# CHECK: Range: 0xF
# CHECK: }
+# CHECK: BlockRelocations [
+# CHECK: 0x4 IMAGE_REL_I386_SECREL .text
+# CHECK: 0x8 IMAGE_REL_I386_SECTION .text
+# CHECK: ]
# CHECK: }
-# CHECK: BlockRelocations [
-# CHECK: 0x4 IMAGE_REL_I386_SECREL .text
-# CHECK: 0x8 IMAGE_REL_I386_SECTION .text
-# CHECK: ]
Ltmp1:
.p2align 2
diff --git a/test/MC/COFF/cv-inline-linetable-infloop.s b/test/MC/COFF/cv-inline-linetable-infloop.s
index 804ed6f404d99..6b8e708befc4e 100644
--- a/test/MC/COFF/cv-inline-linetable-infloop.s
+++ b/test/MC/COFF/cv-inline-linetable-infloop.s
@@ -1,6 +1,6 @@
# RUN: llvm-mc -triple=x86_64-pc-win32 -filetype=obj < %s | llvm-readobj -codeview | FileCheck %s
-# CHECK: InlineSite {
+# CHECK: InlineSiteSym {
# CHECK: BinaryAnnotations [
# CHECK: ChangeLineOffset: 1
# CHECK: ChangeCodeLength: 0x2
diff --git a/test/MC/COFF/cv-inline-linetable-unlikely.s b/test/MC/COFF/cv-inline-linetable-unlikely.s
index dd3a66f419cc4..bfb745bd9bb1e 100644
--- a/test/MC/COFF/cv-inline-linetable-unlikely.s
+++ b/test/MC/COFF/cv-inline-linetable-unlikely.s
@@ -19,13 +19,13 @@
# calls to __asan_report*, for which it is very important to have an accurate
# stack trace.
-# CHECK: ProcStart {
+# CHECK: GlobalProcIdSym {
# CHECK: FunctionType: g (0x1003)
# CHECK: CodeOffset: g+0x0
# CHECK: DisplayName: g
# CHECK: LinkageName: g
# CHECK: }
-# CHECK: InlineSite {
+# CHECK: InlineSiteSym {
# CHECK: Inlinee: f (0x1002)
# CHECK: BinaryAnnotations [
# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xE, LineOffset: 1}
diff --git a/test/MC/COFF/cv-inline-linetable-unreachable.s b/test/MC/COFF/cv-inline-linetable-unreachable.s
index 0f29d1667c359..d894fc758fb15 100644
--- a/test/MC/COFF/cv-inline-linetable-unreachable.s
+++ b/test/MC/COFF/cv-inline-linetable-unreachable.s
@@ -76,7 +76,7 @@ Ltmp6:
.short 4429
.asciz "\000\000\000\000\000\000\000\000\003\020\000"
.cv_inline_linetable 1 1 3 Lfunc_begin0 Lfunc_end0
-# CHECK: InlineSite {
+# CHECK: InlineSiteSym {
# CHECK: PtrParent: 0x0
# CHECK: PtrEnd: 0x0
# CHECK: Inlinee: f (0x1003)
diff --git a/test/MC/COFF/cv-inline-linetable.s b/test/MC/COFF/cv-inline-linetable.s
index bb68fcde21be2..2c89f9836c423 100644
--- a/test/MC/COFF/cv-inline-linetable.s
+++ b/test/MC/COFF/cv-inline-linetable.s
@@ -88,7 +88,7 @@ Ltmp4:
.short 4429
.asciz "\000\000\000\000\000\000\000\000\003\020\000"
.cv_inline_linetable 1 1 9 Lfunc_begin0 Lfunc_end0
-# CHECK: InlineSite {
+# CHECK: InlineSiteSym {
# CHECK: PtrParent: 0x0
# CHECK: PtrEnd: 0x0
# CHECK: Inlinee: bar (0x1003)
@@ -106,7 +106,7 @@ Ltmp6:
.short 4429
.asciz "\000\000\000\000\000\000\000\000\004\020\000"
.cv_inline_linetable 2 1 3 Lfunc_begin0 Lfunc_end0
-# CHECK: InlineSite {
+# CHECK: InlineSiteSym {
# CHECK: PtrParent: 0x0
# CHECK: PtrEnd: 0x0
# CHECK: Inlinee: foo (0x1004)
diff --git a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt
new file mode 100644
index 0000000000000..62e7092086aa0
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt
@@ -0,0 +1,32 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mt | FileCheck %s
+0xc1 0x0b 0x60 0x41 # CHECK: dmt
+0xc1 0x0b 0x65 0x41 # CHECK: dmt $5
+0xe1 0x0b 0x60 0x41 # CHECK: emt
+0xe1 0x0b 0x64 0x41 # CHECK: emt $4
+0x01 0x00 0x60 0x41 # CHECK: dvpe
+0x01 0x00 0x66 0x41 # CHECK: dvpe $6
+0x21 0x00 0x60 0x41 # CHECK: evpe
+0x21 0x00 0x64 0x41 # CHECK: evpe $4
+0x08 0x10 0x65 0x7c # CHECK: fork $2, $3, $5
+0x09 0x00 0x80 0x7c # CHECK: yield $4
+0x09 0x20 0xa0 0x7c # CHECK: yield $4, $5
+0x02 0x20 0x05 0x41 # CHECK: mftr $4, $5, 0, 2, 0
+0x20 0x20 0x05 0x41 # CHECK: mftr $4, $5, 1, 0, 0
+0x21 0x20 0x00 0x41 # CHECK: mftr $4, $zero, 1, 1, 0
+0x21 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 1, 0
+0x22 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 0
+0x32 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 1
+0x23 0x20 0x1a 0x41 # CHECK: mftr $4, $26, 1, 3, 0
+0x23 0x20 0x1f 0x41 # CHECK: mftr $4, $ra, 1, 3, 0
+0x24 0x20 0x0e 0x41 # CHECK: mftr $4, $14, 1, 4, 0
+0x25 0x20 0x0f 0x41 # CHECK: mftr $4, $15, 1, 5, 0
+0x02 0x28 0x84 0x41 # CHECK: mttr $4, $5, 0, 2, 0
+0x20 0x28 0x84 0x41 # CHECK: mttr $4, $5, 1, 0, 0
+0x21 0x00 0x84 0x41 # CHECK: mttr $4, $zero, 1, 1, 0
+0x21 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 1, 0
+0x22 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 0
+0x32 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 1
+0x23 0xd0 0x84 0x41 # CHECK: mttr $4, $26, 1, 3, 0
+0x23 0xf8 0x84 0x41 # CHECK: mttr $4, $ra, 1, 3, 0
+0x24 0x70 0x84 0x41 # CHECK: mttr $4, $14, 1, 4, 0
+0x25 0x78 0x84 0x41 # CHECK: mttr $4, $15, 1, 5, 0
diff --git a/test/MC/Disassembler/Mips/mt/valid-r2.txt b/test/MC/Disassembler/Mips/mt/valid-r2.txt
new file mode 100644
index 0000000000000..4786d8b5591f4
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mt/valid-r2.txt
@@ -0,0 +1,32 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r2 -mattr=+mt | FileCheck %s
+0x41 0x60 0x0b 0xc1 # CHECK: dmt
+0x41 0x65 0x0b 0xc1 # CHECK: dmt $5
+0x41 0x60 0x0b 0xe1 # CHECK: emt
+0x41 0x64 0x0b 0xe1 # CHECK: emt $4
+0x41 0x60 0x00 0x01 # CHECK: dvpe
+0x41 0x66 0x00 0x01 # CHECK: dvpe $6
+0x41 0x60 0x00 0x21 # CHECK: evpe
+0x41 0x64 0x00 0x21 # CHECK: evpe $4
+0x7c 0x65 0x10 0x08 # CHECK: fork $2, $3, $5
+0x7c 0x80 0x00 0x09 # CHECK: yield $4
+0x7c 0xa0 0x20 0x09 # CHECK: yield $4, $5
+0x41 0x05 0x20 0x02 # CHECK: mftr $4, $5, 0, 2, 0
+0x41 0x05 0x20 0x20 # CHECK: mftr $4, $5, 1, 0, 0
+0x41 0x00 0x20 0x21 # CHECK: mftr $4, $zero, 1, 1, 0
+0x41 0x0a 0x20 0x21 # CHECK: mftr $4, $10, 1, 1, 0
+0x41 0x0a 0x20 0x22 # CHECK: mftr $4, $10, 1, 2, 0
+0x41 0x0a 0x20 0x32 # CHECK: mftr $4, $10, 1, 2, 1
+0x41 0x1a 0x20 0x23 # CHECK: mftr $4, $26, 1, 3, 0
+0x41 0x1f 0x20 0x23 # CHECK: mftr $4, $ra, 1, 3, 0
+0x41 0x0e 0x20 0x24 # CHECK: mftr $4, $14, 1, 4, 0
+0x41 0x0f 0x20 0x25 # CHECK: mftr $4, $15, 1, 5, 0
+0x41 0x84 0x28 0x02 # CHECK: mttr $4, $5, 0, 2, 0
+0x41 0x84 0x28 0x20 # CHECK: mttr $4, $5, 1, 0, 0
+0x41 0x84 0x00 0x21 # CHECK: mttr $4, $zero, 1, 1, 0
+0x41 0x84 0x50 0x21 # CHECK: mttr $4, $10, 1, 1, 0
+0x41 0x84 0x50 0x22 # CHECK: mttr $4, $10, 1, 2, 0
+0x41 0x84 0x50 0x32 # CHECK: mttr $4, $10, 1, 2, 1
+0x41 0x84 0xd0 0x23 # CHECK: mttr $4, $26, 1, 3, 0
+0x41 0x84 0xf8 0x23 # CHECK: mttr $4, $ra, 1, 3, 0
+0x41 0x84 0x70 0x24 # CHECK: mttr $4, $14, 1, 4, 0
+0x41 0x84 0x78 0x25 # CHECK: mttr $4, $15, 1, 5, 0
diff --git a/test/MC/ELF/bad-expr3.s b/test/MC/ELF/bad-expr3.s
index 990167cda53f8..cf5d6f47335f5 100644
--- a/test/MC/ELF/bad-expr3.s
+++ b/test/MC/ELF/bad-expr3.s
@@ -1,8 +1,7 @@
// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o /dev/null \
// RUN: 2>&1 | FileCheck %s
-// CHECK: Cannot represent a difference across sections
-
+// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
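+// The [[@LINE+1]] pattern checks that the diagnostic now points at the
+// source location of the offending expression, not just its message text.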
.long foo - bar
.section .zed
foo:
diff --git a/test/MC/Mips/addend.s b/test/MC/Mips/addend.s
new file mode 100644
index 0000000000000..93ce4f413aebe
--- /dev/null
+++ b/test/MC/Mips/addend.s
@@ -0,0 +1,21 @@
+# RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux -mcpu=mips32r6 %s -o %t.o
+# RUN: llvm-readobj -s -section-data %t.o | FileCheck %s
+
+# CHECK: Name: .text
+# CHECK-NEXT: Type:
+# CHECK-NEXT: Flags [
+# CHECK-NEXT: SHF_ALLOC
+# CHECK-NEXT: SHF_EXECINSTR
+# CHECK-NEXT: ]
+# CHECK-NEXT: Address:
+# CHECK-NEXT: Offset:
+# CHECK-NEXT: Size:
+# CHECK-NEXT: Link:
+# CHECK-NEXT: Info:
+# CHECK-NEXT: AddressAlignment:
+# CHECK-NEXT: EntrySize:
+# CHECK-NEXT: SectionData (
+# CHECK-NEXT: 0000: 00000008 |
+# CHECK-NEXT: )
+
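+# The word below stores the addend (8) in place, matching the section data
+# checked above; the _foo - . part should be carried as a PC-relative
+# relocation against _foo.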
+ .word _foo+8-.
diff --git a/test/MC/Mips/mt/abiflag.s b/test/MC/Mips/mt/abiflag.s
new file mode 100644
index 0000000000000..b4769cba4c2d1
--- /dev/null
+++ b/test/MC/Mips/mt/abiflag.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -mattr=+mt -filetype=obj -o - \
+# RUN: | llvm-readobj -mips-abi-flags | FileCheck %s
+
+# Test that the usage of the MT ASE is recorded in .MIPS.abiflags
+
+# CHECK: ASEs
+# CHECK-NEXT: MT (0x40)
+
+ .text
+ nop
diff --git a/test/MC/Mips/mt/invalid-wrong-error.s b/test/MC/Mips/mt/invalid-wrong-error.s
new file mode 100644
index 0000000000000..0247089b70ae6
--- /dev/null
+++ b/test/MC/Mips/mt/invalid-wrong-error.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt < %s 2>&1 | FileCheck %s
+ mftr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list
+ mttr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list
diff --git a/test/MC/Mips/mt/invalid.s b/test/MC/Mips/mt/invalid.s
new file mode 100644
index 0000000000000..d4055c4a50f44
--- /dev/null
+++ b/test/MC/Mips/mt/invalid.s
@@ -0,0 +1,27 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32 -mattr=+mt < %s 2>&1 | FileCheck %s
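+# mftr/mttr take two registers followed by the u (1-bit), sel (3-bit) and
+# h (1-bit) immediates; out-of-range values get a diagnostic naming the
+# expected immediate width.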
+ dmt 4 # CHECK: error: invalid operand for instruction
+ dmt $4, $5 # CHECK: error: invalid operand for instruction
+ dmt $5, 0($4) # CHECK: error: invalid operand for instruction
+ emt 4 # CHECK: error: invalid operand for instruction
+ emt $4, $5 # CHECK: error: invalid operand for instruction
+ emt $5, 0($5) # CHECK: error: invalid operand for instruction
+ dvpe 4 # CHECK: error: invalid operand for instruction
+ dvpe $4, $5 # CHECK: error: invalid operand for instruction
+ dvpe $5, 0($4) # CHECK: error: invalid operand for instruction
+ evpe 4 # CHECK: error: invalid operand for instruction
+ evpe $4, $5 # CHECK: error: invalid operand for instruction
+ evpe $5, 0($5) # CHECK: error: invalid operand for instruction
+ mftr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction
+ mftr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mftr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mftr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction
+ mttr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mttr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mttr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate
diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s
new file mode 100644
index 0000000000000..4e872412e6ef2
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s
@@ -0,0 +1,18 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \
+# RUN: 2>&1 | FileCheck %s
+
+# The integrated assembler produces a wrong or misleading error message.
+
+ mftc0 0($4), $5 # CHECK: error: unexpected token in argument list
+ mftc0 0($4), $5, 1 # CHECK: error: unexpected token in argument list
+ mftgpr 0($4), $5 # CHECK: error: unexpected token in argument list
+ mftlo 0($3) # CHECK: error: unexpected token in argument list
+ mftlo 0($3), $ac1 # CHECK: error: unexpected token in argument list
+ mfthi 0($3) # CHECK: error: unexpected token in argument list
+ mfthi 0($3), $ac1 # CHECK: error: unexpected token in argument list
+ mftacx 0($3) # CHECK: error: unexpected token in argument list
+ mftacx 0($3), $ac1 # CHECK: error: unexpected token in argument list
+ mftdsp 0($4) # CHECK: error: unexpected token in argument list
+ mftc1 0($4), $f4 # CHECK: error: unexpected token in argument list
+ mfthc1 0($4), $f4 # CHECK: error: unexpected token in argument list
+ cftc1 0($4), $f8 # CHECK: error: unexpected token in argument list
diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s
new file mode 100644
index 0000000000000..06ae8c72e654a
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s
@@ -0,0 +1,23 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \
+# RUN: 2>&1 | FileCheck %s
+
+ mftc0 $4, 0($5) # CHECK: error: invalid operand for instruction
+ mftc0 $4, 0($5), 1 # CHECK: error: invalid operand for instruction
+ mftc0 $4, $5, -1 # CHECK: error: expected 3-bit unsigned immediate
+ mftc0 $4, $5, 9 # CHECK: error: expected 3-bit unsigned immediate
+ mftc0 $4, $5, $6 # CHECK: error: expected 3-bit unsigned immediate
+ mftgpr $4, 0($5) # CHECK: error: invalid operand for instruction
+ mftgpr $4, $5, $6 # CHECK: error: invalid operand for instruction
+ mftlo $3, 0($ac1) # CHECK: error: invalid operand for instruction
+ mftlo $4, $ac1, $4 # CHECK: error: invalid operand for instruction
+ mfthi $3, 0($ac1) # CHECK: error: invalid operand for instruction
+ mfthi $4, $ac1, $4 # CHECK: error: invalid operand for instruction
+ mftacx $3, 0($ac1) # CHECK: error: invalid operand for instruction
+ mftacx $4, $ac1, $4 # CHECK: error: invalid operand for instruction
+ mftdsp $4, $5 # CHECK: error: invalid operand for instruction
+ mftdsp $4, $f5 # CHECK: error: invalid operand for instruction
+ mftdsp $4, $ac0 # CHECK: error: invalid operand for instruction
+ mftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction
+ mfthc1 $4, 0($f4) # CHECK: error: invalid operand for instruction
+ cftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction
+ cftc1 $4, $f4, $5 # CHECK: error: invalid operand for instruction
diff --git a/test/MC/Mips/mt/mftr-mttr-aliases.s b/test/MC/Mips/mt/mftr-mttr-aliases.s
new file mode 100644
index 0000000000000..92ed9f9281f20
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-aliases.s
@@ -0,0 +1,47 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s
+
+# Check the various aliases of the m[ft]tr instruction.
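+# As the encodings below show, the accumulator aliases encode $acN as GPR
+# number 4*N plus an offset selecting the component: lo = 0, hi = 1, acx = 2
+# (so mfthi $3, $ac2 reads $9); mftdsp/mttdsp use $16 for the DSP control
+# register. All of the accumulator aliases use u = 1, sel = 1.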
+
+ mftc0 $4, $5 # CHECK: mftr $4, $5, 0, 0, 0 # encoding: [0x41,0x05,0x20,0x00]
+ mftc0 $6, $7, 1 # CHECK: mftr $6, $7, 0, 1, 0 # encoding: [0x41,0x07,0x30,0x01]
+ mftgpr $5, $9 # CHECK: mftr $5, $9, 1, 0, 0 # encoding: [0x41,0x09,0x28,0x20]
+ mftlo $3 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21]
+ mftlo $3, $ac0 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21]
+ mftlo $3, $ac1 # CHECK: mftr $3, $4, 1, 1, 0 # encoding: [0x41,0x04,0x18,0x21]
+ mftlo $3, $ac2 # CHECK: mftr $3, $8, 1, 1, 0 # encoding: [0x41,0x08,0x18,0x21]
+ mftlo $3, $ac3 # CHECK: mftr $3, $12, 1, 1, 0 # encoding: [0x41,0x0c,0x18,0x21]
+ mfthi $3, $ac0 # CHECK: mftr $3, $1, 1, 1, 0 # encoding: [0x41,0x01,0x18,0x21]
+ mfthi $3, $ac1 # CHECK: mftr $3, $5, 1, 1, 0 # encoding: [0x41,0x05,0x18,0x21]
+ mfthi $3, $ac2 # CHECK: mftr $3, $9, 1, 1, 0 # encoding: [0x41,0x09,0x18,0x21]
+ mfthi $3, $ac3 # CHECK: mftr $3, $13, 1, 1, 0 # encoding: [0x41,0x0d,0x18,0x21]
+ mftacx $3, $ac0 # CHECK: mftr $3, $2, 1, 1, 0 # encoding: [0x41,0x02,0x18,0x21]
+ mftacx $3, $ac1 # CHECK: mftr $3, $6, 1, 1, 0 # encoding: [0x41,0x06,0x18,0x21]
+ mftacx $3, $ac2 # CHECK: mftr $3, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x18,0x21]
+ mftacx $3, $ac3 # CHECK: mftr $3, $14, 1, 1, 0 # encoding: [0x41,0x0e,0x18,0x21]
+ mftdsp $4 # CHECK: mftr $4, $16, 1, 1, 0 # encoding: [0x41,0x10,0x20,0x21]
+ mftc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 0 # encoding: [0x41,0x05,0x20,0x22]
+ mfthc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 1 # encoding: [0x41,0x05,0x20,0x32]
+ cftc1 $4, $f9 # CHECK: mftr $4, $9, 1, 3, 0 # encoding: [0x41,0x09,0x20,0x23]
+
+ mttc0 $4, $5 # CHECK: mttr $4, $5, 0, 0, 0 # encoding: [0x41,0x84,0x28,0x00]
+ mttc0 $6, $7, 1 # CHECK: mttr $6, $7, 0, 1, 0 # encoding: [0x41,0x86,0x38,0x01]
+ mttgpr $5, $9 # CHECK: mttr $5, $9, 1, 0, 0 # encoding: [0x41,0x85,0x48,0x20]
+ mttlo $3 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21]
+ mttlo $3, $ac0 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21]
+ mttlo $3, $ac1 # CHECK: mttr $3, $4, 1, 1, 0 # encoding: [0x41,0x83,0x20,0x21]
+ mttlo $3, $ac2 # CHECK: mttr $3, $8, 1, 1, 0 # encoding: [0x41,0x83,0x40,0x21]
+ mttlo $3, $ac3 # CHECK: mttr $3, $12, 1, 1, 0 # encoding: [0x41,0x83,0x60,0x21]
+ mtthi $3 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21]
+ mtthi $3, $ac0 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21]
+ mtthi $3, $ac1 # CHECK: mttr $3, $5, 1, 1, 0 # encoding: [0x41,0x83,0x28,0x21]
+ mtthi $3, $ac2 # CHECK: mttr $3, $9, 1, 1, 0 # encoding: [0x41,0x83,0x48,0x21]
+ mtthi $3, $ac3 # CHECK: mttr $3, $13, 1, 1, 0 # encoding: [0x41,0x83,0x68,0x21]
+ mttacx $3 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21]
+ mttacx $3, $ac0 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21]
+ mttacx $3, $ac1 # CHECK: mttr $3, $6, 1, 1, 0 # encoding: [0x41,0x83,0x30,0x21]
+ mttacx $3, $ac2 # CHECK: mttr $3, $10, 1, 1, 0 # encoding: [0x41,0x83,0x50,0x21]
+ mttacx $3, $ac3 # CHECK: mttr $3, $14, 1, 1, 0 # encoding: [0x41,0x83,0x70,0x21]
+ mttdsp $4 # CHECK: mttr $4, $16, 1, 1, 0 # encoding: [0x41,0x84,0x80,0x21]
+ mttc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 0 # encoding: [0x41,0x84,0x28,0x22]
+ mtthc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 1 # encoding: [0x41,0x84,0x28,0x32]
+ cttc1 $4, $f9 # CHECK: mttr $4, $9, 1, 3, 0 # encoding: [0x41,0x84,0x48,0x23]
diff --git a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s
new file mode 100644
index 0000000000000..c40e81bfc7d75
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s
+
+# The selector value and register values here are marked as reserved in the
+# documentation, but GAS accepts them without warning.
+ mftr $31, $31, 1, 1, 0 # CHECK: mftr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x1f,0xf8,0x21]
+ mttr $31, $31, 1, 1, 0 # CHECK: mttr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x9f,0xf8,0x21]
+ mftr $31, $13, 1, 6, 0 # CHECK: mftr $ra, $13, 1, 6, 0 # encoding: [0x41,0x0d,0xf8,0x26]
+ mttr $31, $13, 1, 6, 0 # CHECK: mttr $ra, $13, 1, 6, 0 # encoding: [0x41,0x9f,0x68,0x26]
diff --git a/test/MC/Mips/mt/module-directive-invalid.s b/test/MC/Mips/mt/module-directive-invalid.s
new file mode 100644
index 0000000000000..38baaa07cdc17
--- /dev/null
+++ b/test/MC/Mips/mt/module-directive-invalid.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r5 < %s 2>&1 | FileCheck %s
+
+# CHECK: error: .module directive must appear before any code
+ .set nomips16
+ .module mt
+ nop
diff --git a/test/MC/Mips/mt/module-directive.s b/test/MC/Mips/mt/module-directive.s
new file mode 100644
index 0000000000000..d316f054eaae3
--- /dev/null
+++ b/test/MC/Mips/mt/module-directive.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN: llvm-readobj -mips-abi-flags | FileCheck --check-prefix=CHECK-OBJ %s
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \
+# RUN: FileCheck --check-prefix=CHECK-ASM %s
+
+# Test that the .module directive sets the MT flag in .MIPS.abiflags when
+# assembling to object files.
+
+# Test that the .module directive is re-emitted when producing assembly output.
+
+# CHECK-OBJ: ASEs
+# CHECK-OBJ-NEXT: MT (0x40)
+
+# CHECK-ASM: .module mt
+.module mt
+nop
diff --git a/test/MC/Mips/mt/set-directive.s b/test/MC/Mips/mt/set-directive.s
new file mode 100644
index 0000000000000..53ed4b273795d
--- /dev/null
+++ b/test/MC/Mips/mt/set-directive.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN: llvm-readobj -mips-abi-flags | FileCheck %s --check-prefix=CHECK-OBJ
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \
+# RUN: FileCheck %s --check-prefix=CHECK-ASM
+
+# Test that the MT ASE flag in .MIPS.abiflags is _not_ set by .set.
+# Test that '.set mt' is emitted by the asm target streamer.
+
+# CHECK-OBJ: ASEs
+# CHECK-OBJ-NOT: MT (0x40)
+
+# CHECK-ASM: .set mt
+ .set mt
+ nop
diff --git a/test/MC/Mips/mt/valid.s b/test/MC/Mips/mt/valid.s
new file mode 100644
index 0000000000000..9fa07870a61f2
--- /dev/null
+++ b/test/MC/Mips/mt/valid.s
@@ -0,0 +1,33 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \
+# RUN: | FileCheck %s
+ dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1]
+ dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1]
+ emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1]
+ emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1]
+ dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01]
+ dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01]
+ evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21]
+ evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21]
+ fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08]
+ yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09]
+ yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09]
+ mftr $4, $5, 0, 2, 0 # CHECK: mftr $4, $5, 0, 2, 0 # encoding: [0x41,0x05,0x20,0x02]
+ mftr $4, $5, 1, 0, 0 # CHECK: mftr $4, $5, 1, 0, 0 # encoding: [0x41,0x05,0x20,0x20]
+ mftr $4, $0, 1, 1, 0 # CHECK: mftr $4, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x20,0x21]
+ mftr $4, $10, 1, 1, 0 # CHECK: mftr $4, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x20,0x21]
+ mftr $4, $10, 1, 2, 0 # CHECK: mftr $4, $10, 1, 2, 0 # encoding: [0x41,0x0a,0x20,0x22]
+ mftr $4, $10, 1, 2, 1 # CHECK: mftr $4, $10, 1, 2, 1 # encoding: [0x41,0x0a,0x20,0x32]
+ mftr $4, $26, 1, 3, 0 # CHECK: mftr $4, $26, 1, 3, 0 # encoding: [0x41,0x1a,0x20,0x23]
+ mftr $4, $31, 1, 3, 0 # CHECK: mftr $4, $ra, 1, 3, 0 # encoding: [0x41,0x1f,0x20,0x23]
+ mftr $4, $14, 1, 4, 0 # CHECK: mftr $4, $14, 1, 4, 0 # encoding: [0x41,0x0e,0x20,0x24]
+ mftr $4, $15, 1, 5, 0 # CHECK: mftr $4, $15, 1, 5, 0 # encoding: [0x41,0x0f,0x20,0x25]
+ mttr $4, $5, 0, 2, 0 # CHECK: mttr $4, $5, 0, 2, 0 # encoding: [0x41,0x84,0x28,0x02]
+ mttr $4, $5, 1, 0, 0 # CHECK: mttr $4, $5, 1, 0, 0 # encoding: [0x41,0x84,0x28,0x20]
+ mttr $4, $0, 1, 1, 0 # CHECK: mttr $4, $zero, 1, 1, 0 # encoding: [0x41,0x84,0x00,0x21]
+ mttr $4, $10, 1, 1, 0 # CHECK: mttr $4, $10, 1, 1, 0 # encoding: [0x41,0x84,0x50,0x21]
+ mttr $4, $10, 1, 2, 0 # CHECK: mttr $4, $10, 1, 2, 0 # encoding: [0x41,0x84,0x50,0x22]
+ mttr $4, $10, 1, 2, 1 # CHECK: mttr $4, $10, 1, 2, 1 # encoding: [0x41,0x84,0x50,0x32]
+ mttr $4, $26, 1, 3, 0 # CHECK: mttr $4, $26, 1, 3, 0 # encoding: [0x41,0x84,0xd0,0x23]
+ mttr $4, $31, 1, 3, 0 # CHECK: mttr $4, $ra, 1, 3, 0 # encoding: [0x41,0x84,0xf8,0x23]
+ mttr $4, $14, 1, 4, 0 # CHECK: mttr $4, $14, 1, 4, 0 # encoding: [0x41,0x84,0x70,0x24]
+ mttr $4, $15, 1, 5, 0 # CHECK: mttr $4, $15, 1, 5, 0 # encoding: [0x41,0x84,0x78,0x25]
diff --git a/test/MC/WebAssembly/array-fill.ll b/test/MC/WebAssembly/array-fill.ll
new file mode 100644
index 0000000000000..4feabc0748e0f
--- /dev/null
+++ b/test/MC/WebAssembly/array-fill.ll
@@ -0,0 +1,14 @@
+; RUN: llc -filetype=obj %s -o - | obj2yaml | FileCheck %s
+; PR33624
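+; Check that both one-byte struct initializers land in the data segment back
+; to back ('0102') and that DataSize reflects them.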
+
+source_filename = "ws.c"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown-wasm"
+
+%struct.bd = type { i8 }
+
+@gBd = hidden global [2 x %struct.bd] [%struct.bd { i8 1 }, %struct.bd { i8 2 }], align 1
+
+; CHECK: - Type: DATA
+; CHECK: Content: '0102'
+; CHECK: DataSize: 2
diff --git a/test/MC/WebAssembly/external-data.ll b/test/MC/WebAssembly/external-data.ll
index 6914736ac671a..b8c97453413e2 100644
--- a/test/MC/WebAssembly/external-data.ll
+++ b/test/MC/WebAssembly/external-data.ll
@@ -13,7 +13,8 @@
; CHECK: Index: 0
; CHECK: Offset: 0x0000000E
; CHECK: Segments:
-; CHECK: - Index: 0
+; CHECK: - SectionOffset: 6
+; CHECK: MemoryIndex: 0
; CHECK: Offset:
; CHECK: Opcode: I32_CONST
; CHECK: Value: 0
diff --git a/test/MC/WebAssembly/external-func-address.ll b/test/MC/WebAssembly/external-func-address.ll
index 4022b2c9bae97..53da9805f9871 100644
--- a/test/MC/WebAssembly/external-func-address.ll
+++ b/test/MC/WebAssembly/external-func-address.ll
@@ -2,24 +2,33 @@
; Verify that addresses of external functions generate correctly typed
; imports and relocations of type R_WEBASSEMBLY_TABLE_INDEX_I32.
-declare void @f1() #1
-@ptr_to_f1 = hidden global void ()* @f1, align 4
+declare void @f1(i32) #1
+@ptr_to_f1 = hidden global void (i32)* @f1, align 4
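+; Taking f1's address should yield an import with the correct (i32) -> void
+; signature, an ELEM table entry for it, and an R_WEBASSEMBLY_TABLE_INDEX_I32
+; relocation in the DATA section, as the checks below verify.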
-
-; CHECK: - Type: IMPORT
-; CHECK: Imports:
-; CHECK: - Module: env
-; CHECK: Field: f1
-; CHECK: Kind: FUNCTION
-; CHECK: SigIndex: 0
-; CHECK: - Type: ELEM
-; CHECK: Segments:
-; CHECK: - Offset:
-; CHECK: Opcode: I32_CONST
-; CHECK: Value: 0
-; CHECK: Functions: [ 0 ]
-; CHECK: - Type: DATA
-; CHECK: Relocations:
-; CHECK: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
-; CHECK: Index: 0
-; CHECK: Offset: 0x00000006
+; CHECK: --- !WASM
+; CHECK-NEXT: FileHeader:
+; CHECK-NEXT: Version: 0x00000001
+; CHECK-NEXT: Sections:
+; CHECK-NEXT: - Type: TYPE
+; CHECK-NEXT: Signatures:
+; CHECK-NEXT: - Index: 0
+; CHECK-NEXT: ReturnType: NORESULT
+; CHECK-NEXT: ParamTypes:
+; CHECK-NEXT: - I32
+; CHECK: - Type: IMPORT
+; CHECK-NEXT: Imports:
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: f1
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: SigIndex: 0
+; CHECK: - Type: ELEM
+; CHECK-NEXT: Segments:
+; CHECK-NEXT: - Offset:
+; CHECK-NEXT: Opcode: I32_CONST
+; CHECK-NEXT: Value: 0
+; CHECK-NEXT: Functions: [ 0 ]
+; CHECK: - Type: DATA
+; CHECK-NEXT: Relocations:
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
+; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Offset: 0x00000006
diff --git a/test/MC/WebAssembly/unnamed-data.ll b/test/MC/WebAssembly/unnamed-data.ll
index fd985088c1d27..fa0ff966a79fd 100644
--- a/test/MC/WebAssembly/unnamed-data.ll
+++ b/test/MC/WebAssembly/unnamed-data.ll
@@ -46,7 +46,8 @@
; CHECK-NEXT: Index: 1
; CHECK-NEXT: Offset: 0x0000001E
; CHECK-NEXT: Segments:
-; CHECK-NEXT: - Index: 0
+; CHECK-NEXT: - SectionOffset: 6
+; CHECK-NEXT: MemoryIndex: 0
; CHECK-NEXT: Offset:
; CHECK-NEXT: Opcode: I32_CONST
; CHECK-NEXT: Value: 0
diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll
index 6e2b8631d2b17..1d80ea4aac6c1 100644
--- a/test/MC/WebAssembly/weak-alias.ll
+++ b/test/MC/WebAssembly/weak-alias.ll
@@ -3,27 +3,56 @@
; The foo_alias() function is a weak alias of the function foo()
; Generates two exports of the same function, one of them weak
-@foo_alias = weak hidden alias i32 (...), bitcast (i32 ()* @foo to i32 (...)*)
+@foo_alias = weak hidden alias i32 (), i32 ()* @foo
+
+define hidden i32 @call_alias() #0 {
+entry:
+ %call = call i32 @foo_alias()
+ ret i32 %call
+}
define hidden i32 @foo() #0 {
entry:
ret i32 0
}
+
+; CHECK: - Type: TYPE
+; CHECK-NEXT: Signatures:
+; CHECK-NEXT: - Index: 0
+; CHECK-NEXT: ReturnType: I32
+; CHECK-NEXT: ParamTypes:
+
+; CHECK: - Type: IMPORT
+; CHECK-NEXT: Imports:
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: foo_alias
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: SigIndex: 0
+
+; CHECK: - Type: FUNCTION
+; CHECK-NEXT: FunctionTypes: [ 0, 0 ]
+
; CHECK: - Type: EXPORT
; CHECK-NEXT: Exports:
+; CHECK-NEXT: - Name: call_alias
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 1
; CHECK-NEXT: - Name: foo
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Index: 2
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
-
+; CHECK-NEXT: Index: 2
; CHECK: - Type: CUSTOM
; CHECK-NEXT: Name: name
; CHECK-NEXT: FunctionNames:
; CHECK-NEXT: - Index: 0
+; CHECK-NEXT: Name: foo_alias
+; CHECK-NEXT: - Index: 1
+; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: - Index: 2
; CHECK-NEXT: Name: foo
; CHECK-NEXT: - Type: CUSTOM
; CHECK-NEXT: Name: linking
diff --git a/test/Object/Inputs/trivial-object-test.wasm b/test/Object/Inputs/trivial-object-test.wasm
new file mode 100644
index 0000000000000..1f3947ac472e0
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.wasm
Binary files differ
diff --git a/test/Object/Inputs/trivial.ll b/test/Object/Inputs/trivial.ll
index 37a6bc20a8c2c..528a713c7fa3c 100644
--- a/test/Object/Inputs/trivial.ll
+++ b/test/Object/Inputs/trivial.ll
@@ -1,3 +1,6 @@
+; Input used for generating checked-in binaries (trivial-object-test.*)
+; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial-object-test.wasm
+
@.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1
define i32 @main() nounwind {
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index c1f4d9e1f96ff..f1aadd5cccf59 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -2,6 +2,8 @@ RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm -a -S - \
RUN: | FileCheck %s -check-prefix COFF32
RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm -a -S - \
RUN: | FileCheck %s -check-prefix COFF64
+RUN: llvm-nm %p/Inputs/trivial-object-test.wasm \
+RUN: | FileCheck %s -check-prefix WASM
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 -S \
@@ -57,6 +59,11 @@ COFF32-NEXT: U _SomeOtherFunction
COFF32-NEXT: 00000000 T _main
COFF32-NEXT: U _puts
+WASM: U SomeOtherFunction
+WASM-NEXT: 00000002 T main
+WASM-NEXT: U puts
+WASM-NEXT: 00000001 D var
+
COFF64: 00000000 d .data
COFF64-NEXT: 00000000 t .text
COFF64-NEXT: 00000000 r ??__Ex@@YAXXZ
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index b89311db60697..73d466cc4993e 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -4,8 +4,8 @@ RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mipsel | FileCheck %s --check-pr
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mips64el | FileCheck %s --check-prefix ELF-MIPS64EL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-avr | FileCheck %s --check-prefix ELF-AVR
-RUN: obj2yaml %p/Inputs/unwind-section.elf-x86-64 \
-RUN: | FileCheck %s --check-prefix ELF-X86-64-UNWIND
+RUN: obj2yaml %p/Inputs/trivial-object-test.wasm | FileCheck %s --check-prefix WASM
+RUN: obj2yaml %p/Inputs/unwind-section.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64-UNWIND
COFF-I386: header:
COFF-I386-NEXT: Machine: IMAGE_FILE_MACHINE_I386
@@ -411,13 +411,13 @@ ELF-X86-64-NEXT: - Name: SomeOtherFunction
ELF-X86-64-NEXT: - Name: puts
-ELF-AVR: FileHeader:
+ELF-AVR: FileHeader:
ELF-AVR-NEXT: Class: ELFCLASS32
ELF-AVR-NEXT: Data: ELFDATA2LSB
ELF-AVR-NEXT: Type: ET_EXEC
ELF-AVR-NEXT: Machine: EM_AVR
ELF-AVR-NEXT: Flags: [ EF_AVR_ARCH_AVR2 ]
-ELF-AVR-NEXT: Sections:
+ELF-AVR-NEXT: Sections:
ELF-AVR-NEXT: - Name: .text
ELF-AVR-NEXT: Type: SHT_PROGBITS
ELF-AVR-NEXT: Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
@@ -429,8 +429,8 @@ ELF-AVR-NEXT: Flags: [ SHF_WRITE, SHF_ALLOC ]
ELF-AVR-NEXT: Address: 0x0000000000800060
ELF-AVR-NEXT: AddressAlign: 0x0000000000000001
ELF-AVR-NEXT: Content: ''
-ELF-AVR-NEXT: Symbols:
-ELF-AVR-NEXT: Local:
+ELF-AVR-NEXT: Symbols:
+ELF-AVR-NEXT: Local:
ELF-AVR-NEXT: - Type: STT_SECTION
ELF-AVR-NEXT: Section: .text
ELF-AVR-NEXT: - Type: STT_SECTION
@@ -440,7 +440,7 @@ ELF-AVR-NEXT: - Name: a.o
ELF-AVR-NEXT: Type: STT_FILE
ELF-AVR-NEXT: - Name: main
ELF-AVR-NEXT: Section: .text
-ELF-AVR-NEXT: Global:
+ELF-AVR-NEXT: Global:
ELF-AVR-NEXT: - Name: __trampolines_start
ELF-AVR-NEXT: Section: .text
ELF-AVR-NEXT: - Name: _etext
@@ -470,6 +470,17 @@ ELF-AVR-NEXT: - Name: _end
ELF-AVR-NEXT: Section: .data
ELF-AVR-NEXT: Value: 0x0000000000800060
+WASM: --- !WASM
+WASM-NEXT: FileHeader:
+WASM-NEXT: Version: 0x00000001
+WASM: - Type: EXPORT
+WASM-NEXT: Exports:
+WASM-NEXT: - Name: main
+WASM-NEXT: Kind: FUNCTION
+WASM-NEXT: Index: 2
+WASM-NEXT: - Name: var
+WASM-NEXT: Kind: GLOBAL
+WASM-NEXT: Index: 1
ELF-X86-64-UNWIND: - Name: .eh_frame
ELF-X86-64-UNWIND-NEXT: Type: SHT_X86_64_UNWIND
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index 1e41f78ca729e..29f0019628758 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -12,6 +12,8 @@ RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-mips64el \
RUN: | FileCheck %s -check-prefix ELF-MIPS64EL
RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-mipsel \
RUN: | FileCheck %s -check-prefix ELF-MIPSEL
+RUN: llvm-objdump -r %p/Inputs/trivial-object-test.wasm \
+RUN: | FileCheck %s -check-prefix WASM
RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \
RUN: | FileCheck %s -check-prefix ELF-complex-x86-64
@@ -57,6 +59,11 @@ ELF-MIPSEL: R_MIPS_LO16 $.str
ELF-MIPSEL: R_MIPS_CALL16 puts
ELF-MIPSEL: R_MIPS_CALL16 SomeOtherFunction
+WASM: CODE
+WASM-NEXT: R_WEBASSEMBLY_GLOBAL_ADDR_SLEB 0+0
+WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB 0+0
+WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB 1+0
+
ELF-complex-x86-64: .text
ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4
ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4
diff --git a/test/ObjectYAML/wasm/data_section.yaml b/test/ObjectYAML/wasm/data_section.yaml
index b8c65abbff912..521aa54027841 100644
--- a/test/ObjectYAML/wasm/data_section.yaml
+++ b/test/ObjectYAML/wasm/data_section.yaml
@@ -8,7 +8,7 @@ Sections:
- Initial: 0x00000003
- Type: DATA
Segments:
- - Index: 0
+ - MemoryIndex: 0
Offset:
Opcode: I32_CONST
Value: 4
@@ -38,7 +38,8 @@ Sections:
# CHECK-NEXT: Offset: 0x00000006
# CHECK-NEXT: Addend: -6
# CHECK-NEXT: Segments:
-# CHECK-NEXT: - Index: 0
+# CHECK-NEXT: - SectionOffset: 6
+# CHECK-NEXT: MemoryIndex: 0
# CHECK-NEXT: Offset:
# CHECK-NEXT: Opcode: I32_CONST
# CHECK-NEXT: Value: 4
diff --git a/test/Other/2002-01-31-CallGraph.ll b/test/Other/2002-01-31-CallGraph.ll
index 0e4c877512631..d4819357ac67e 100644
--- a/test/Other/2002-01-31-CallGraph.ll
+++ b/test/Other/2002-01-31-CallGraph.ll
@@ -1,6 +1,6 @@
; Call graph construction crash: Not handling indirect calls right
;
-; RUN: opt < %s -analyze -print-callgraph >& /dev/null
+; RUN: opt < %s -analyze -print-callgraph > /dev/null 2>&1
;
%FunTy = type i32 (i32)
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index fbecb34aa4b7c..a0658c10d6098 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -26,6 +26,37 @@
; RUN: -passes='lto-pre-link<O2>' -S %s 2>&1 \
; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-peephole='no-op-function' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-PEEPHOLE
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-late-loop-optimizations='no-op-loop' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-LOOP-LATE
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-loop-optimizer-end='no-op-loop' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-LOOP-END
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-scalar-optimizer-late='no-op-function' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-SCALAR-LATE
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-cgscc-optimizer-late='no-op-cgscc' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-CGSCC-LATE
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes-ep-vectorizer-start='no-op-function' \
+; RUN: -passes='default<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN: --check-prefix=CHECK-EP-VECTORIZER-START
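+; Each -passes-ep-* flag above injects a no-op pass at the corresponding
+; extension point; the CHECK-EP-* prefixes below verify where in the
+; pipeline it runs.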
+
; CHECK-O: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@@ -53,6 +84,7 @@
; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
; CHECK-O-NEXT: Starting llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
@@ -84,6 +116,7 @@
; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: TailCallElimPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: ReassociatePass
@@ -105,8 +138,10 @@
; CHECK-O-NEXT: Starting Loop pass manager run.
; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass
+; CHECK-EP-LOOP-LATE-NEXT: Running pass: NoOpLoopPass
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
+; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
; CHECK-O-NEXT: Finished Loop pass manager run.
; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-Os-NEXT: Running pass: GVN
@@ -126,15 +161,19 @@
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: JumpThreadingPass
; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
; CHECK-O-NEXT: Running pass: DSEPass
; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-EP-SCALAR-LATE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: ADCEPass
; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass
; CHECK-O-NEXT: Finished CGSCC pass manager run.
; CHECK-O-NEXT: Finished llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
@@ -146,6 +185,7 @@
; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
; CHECK-O-NEXT: Starting llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: Float2IntPass
+; CHECK-EP-VECTORIZER-START-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass
; CHECK-O-NEXT: Running pass: LoopDistributePass
; CHECK-O-NEXT: Running pass: LoopVectorizePass
diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll
index dfd2983532729..cab3965bf18fd 100644
--- a/test/Other/new-pm-lto-defaults.ll
+++ b/test/Other/new-pm-lto-defaults.ll
@@ -17,6 +17,10 @@
; RUN: opt -disable-verify -debug-pass-manager \
; RUN: -passes='lto<Oz>' -S %s 2>&1 \
; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='lto<O3>' -S %s -passes-ep-peephole='no-op-function' 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \
+; RUN: --check-prefix=CHECK-EP-Peephole
; CHECK-O: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module
@@ -45,13 +49,18 @@
; CHECK-O2-NEXT: Running analysis: AssumptionAnalysis
; CHECK-O2-NEXT: Running pass: ConstantMergePass
; CHECK-O2-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}InstCombinePass>
+; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O2-NEXT: Running pass: InstCombinePass
+; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
+; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>
; CHECK-O2-NEXT: Running pass: GlobalOptPass
; CHECK-O2-NEXT: Running pass: GlobalDCEPass
; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
; CHECK-O2-NEXT: Running pass: InstCombinePass
+; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O2-NEXT: Running pass: JumpThreadingPass
; CHECK-O2-NEXT: Running analysis: LazyValueAnalysis
; CHECK-O2-NEXT: Running pass: SROA on foo
diff --git a/test/Other/pass-pipelines.ll b/test/Other/pass-pipelines.ll
index 971ed2c094730..d47c02ee7a469 100644
--- a/test/Other/pass-pipelines.ll
+++ b/test/Other/pass-pipelines.ll
@@ -24,7 +24,7 @@
; CHECK-O2: Dead Argument Elimination
; CHECK-O2-NEXT: FunctionPass Manager
; CHECK-O2-NOT: Manager
-; Very carefully asert the CGSCC pass pipeline as it is fragile and unusually
+; Very carefully assert the CGSCC pass pipeline as it is fragile and unusually
; susceptible to phase ordering issues.
; CHECK-O2: CallGraph Construction
; CHECK-O2-NEXT: Globals Alias Analysis
diff --git a/test/SafepointIRVerifier/basic-use-after-reloc.ll b/test/SafepointIRVerifier/basic-use-after-reloc.ll
new file mode 100644
index 0000000000000..4b0746c9f5275
--- /dev/null
+++ b/test/SafepointIRVerifier/basic-use-after-reloc.ll
@@ -0,0 +1,23 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
+
+; This test checks that the verifier catches a value that is used
+; immediately after a safepoint instead of its relocated counterpart.
+
+%jObject = type { [8 x i8] }
+
+; Function Attrs: nounwind
+define %jObject addrspace(1)* @test(%jObject addrspace(1)* %arg) gc "statepoint-example" {
+bci_0:
+ %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, %jObject addrspace(1)* %arg)
+ %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 13, i32 13)
+ ret %jObject addrspace(1)* %arg
+; CHECK: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: %jObject addrspace(1)* %arg
+; CHECK-NEXT: Use: ret %jObject addrspace(1)* %arg
+}
+
+; Function Attrs: nounwind
+declare %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token, i32, i32) #3
+
+declare token @llvm.experimental.gc.statepoint.p0f_f64f64f(i64, i32, double (double)*, i32, i32, ...)
diff --git a/test/SafepointIRVerifier/compares.ll b/test/SafepointIRVerifier/compares.ll
new file mode 100644
index 0000000000000..a14fc44e9814c
--- /dev/null
+++ b/test/SafepointIRVerifier/compares.ll
@@ -0,0 +1,85 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
+
+; In some cases, it is valid to have unrelocated pointers used as compare
+; operands. Make sure the verifier knows how to spot these exceptions.
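+; The common thread: such a compare could legally be hoisted above the
+; safepoint (operands are null, exclusively derived from null, or all
+; unrelocated), so a moving collector cannot change its result.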
+
+
+; comparison against null.
+define i8 addrspace(1)* @test1(i64 %arg, i8 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test1
+entry:
+ %load_addr = getelementptr i8, i8 addrspace(1)* %addr, i64 %arg
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ %cmp = icmp eq i8 addrspace(1)* %load_addr, null
+ ret i8 addrspace(1)* null
+}
+
+; comparison against exclusively derived null.
+define void @test2(i64 %arg, i1 %cond, i8 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test2
+ %load_addr = getelementptr i8, i8 addrspace(1)* null, i64 %arg
+ %load_addr_sel = select i1 %cond, i8 addrspace(1)* null, i8 addrspace(1)* %load_addr
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ %cmp = icmp eq i8 addrspace(1)* %addr, %load_addr_sel
+ ret void
+}
+
+; comparison against a constant non-null pointer. This is an unrelocated use,
+; since those pointer bits may mean something to a VM.
+define void @test3(i64 %arg, i32 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test3
+; CHECK: Illegal use of unrelocated value found!
+entry:
+ %load_addr = getelementptr i32, i32 addrspace(1)* %addr, i64 %arg
+ %load_addr_const = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ %cmp = icmp eq i32 addrspace(1)* %load_addr, %load_addr_const
+ ret void
+}
+
+; comparison against a derived pointer that is *not* exclusively derived from
+; null. An unrelocated use since the derived pointer could be from the constant
+; non-null pointer (load_addr.2).
+define void @test4(i64 %arg, i1 %cond, i8 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test4
+; CHECK: Illegal use of unrelocated value found!
+entry:
+ %load_addr.1 = getelementptr i8, i8 addrspace(1)* null, i64 %arg
+ br i1 %cond, label %split, label %join
+
+split:
+ %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
+ br label %join
+
+join:
+ %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ %cmp = icmp eq i8 addrspace(1)* %load_addr, %base
+ ret void
+}
+
+; comparison between two unrelocated base pointers.
+; Since the cmp can be reordered legally before the safepoint, these are correct
+; unrelocated uses of the pointers.
+define void @test5(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test5
+ %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
+ %load_addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 %arg
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ %cmp = icmp eq i8 addrspace(1)* %load_addr1, %load_addr2
+ ret void
+}
+
+; comparison between a relocated and an unrelocated pointer.
+; This is an invalid use of the unrelocated pointer.
+define void @test6(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test6
+; CHECK: Illegal use of unrelocated value found!
+ %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0)
+ %ptr2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2
+ %cmp = icmp eq i8 addrspace(1)* %load_addr1, %ptr2.relocated
+ ret void
+}
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
diff --git a/test/SafepointIRVerifier/constant-bases.ll b/test/SafepointIRVerifier/constant-bases.ll
new file mode 100644
index 0000000000000..52a2a46d068d0
--- /dev/null
+++ b/test/SafepointIRVerifier/constant-bases.ll
@@ -0,0 +1,70 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
+
+define i8 addrspace(1)* @test1(i64 %arg) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test1
+entry:
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* null
+}
+
+define i8 addrspace(1)* @test2(i64 %arg) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test2
+entry:
+ %load_addr = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %load_addr
+}
+
+define i8 addrspace(1)* @test3(i64 %arg) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test3
+entry:
+ %load_addr = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg
+ %load_addr.cast = bitcast i32 addrspace(1)* %load_addr to i8 addrspace(1)*
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %load_addr.cast
+}
+
+define i8 addrspace(1)* @test4(i64 %arg, i1 %cond) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test4
+entry:
+ %load_addr.1 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
+ br i1 %cond, label %split, label %join
+
+split:
+ %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
+ br label %join
+
+join:
+ %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %load_addr
+}
+
+define i8 addrspace(1)* @test5(i64 %arg, i1 %cond) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test5
+entry:
+ %load_addr.1 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
+ %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
+ %load_addr = select i1 %cond, i8 addrspace(1)* %load_addr.1, i8 addrspace(1)* %load_addr.2
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %load_addr
+}
+
+define i8 addrspace(1)* @test6(i64 %arg, i1 %cond, i8 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test6
+; CHECK: Illegal use of unrelocated value found!
+entry:
+ %load_addr.1 = getelementptr i8, i8 addrspace(1)* %base, i64 %arg
+ br i1 %cond, label %split, label %join
+
+split:
+ %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
+ br label %join
+
+join:
+ %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %load_addr
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
diff --git a/test/SafepointIRVerifier/unrecorded-live-at-sp.ll b/test/SafepointIRVerifier/unrecorded-live-at-sp.ll
new file mode 100644
index 0000000000000..e3f21c3e7133a
--- /dev/null
+++ b/test/SafepointIRVerifier/unrecorded-live-at-sp.ll
@@ -0,0 +1,71 @@
+; RUN: opt %s -safepoint-ir-verifier-print-only -verify-safepoint-ir -S 2>&1 | FileCheck %s
+
+; CHECK: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: %base_phi3 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi2, %bci_37-aload ], !is_base_value !0
+; CHECK-NEXT: Use: %base_phi2 = phi %jObject addrspace(1)* [ %base_phi3, %not_zero179 ], [ %cast5, %bci_0 ], !is_base_value !0
+
+%jObject = type { [8 x i8] }
+
+declare %jObject addrspace(1)* @generate_obj1() #1
+
+declare %jObject addrspace(1)* addrspace(1)* @generate_obj2() #1
+
+declare %jObject addrspace(1)* @generate_obj3() #1
+
+; Function Attrs: nounwind
+define void @test(%jObject addrspace(1)*, %jObject addrspace(1)*, i32) #3 gc "statepoint-example" {
+bci_0:
+ %result608 = call %jObject addrspace(1)* @generate_obj3()
+ %obj609 = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)*
+ %cast = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)*
+ %cast5 = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)*
+ br label %bci_37-aload
+
+bci_37-aload: ; preds = %not_zero179, %bci_0
+ %base_phi = phi %jObject addrspace(1)* [ %base_phi1.relocated, %not_zero179 ], [ %cast, %bci_0 ], !is_base_value !0
+ %base_phi2 = phi %jObject addrspace(1)* [ %base_phi3, %not_zero179 ], [ %cast5, %bci_0 ], !is_base_value !0
+ %relocated8 = phi %jObject addrspace(1)* [ %relocated7.relocated, %not_zero179 ], [ %obj609, %bci_0 ]
+ %tmp3 = getelementptr inbounds %jObject, %jObject addrspace(1)* %relocated8, i64 0, i32 0, i64 32
+ %addr98 = bitcast i8 addrspace(1)* %tmp3 to %jObject addrspace(1)* addrspace(1)*
+ %cast6 = bitcast %jObject addrspace(1)* %base_phi2 to %jObject addrspace(1)* addrspace(1)*
+ br i1 undef, label %not_zero179, label %not_zero146
+
+not_zero146: ; preds = %bci_37-aload
+ %addr98.relocated = call %jObject addrspace(1)* addrspace(1)* @generate_obj2() #1
+ %obj609.relocated = call %jObject addrspace(1)* @generate_obj1() #1
+ br label %not_zero179
+
+not_zero179: ; preds = %not_zero146, %bci_37-aload
+ %base_phi1 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi, %bci_37-aload ], !is_base_value !0
+ %base_phi3 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi2, %bci_37-aload ], !is_base_value !0
+ %relocated7 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %relocated8, %bci_37-aload ]
+ %base_phi4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %cast6, %bci_37-aload ], !is_base_value !0
+ %relocated4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %addr98, %bci_37-aload ]
+ %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7)
+ %tmp4 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
+ %base_phi1.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 12)
+ %base_phi4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 13)
+ %relocated4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 14)
+ %relocated7.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 15)
+ %addr636 = bitcast %jObject addrspace(1)* addrspace(1)* %relocated4.relocated to %jObject addrspace(1)* addrspace(1)*
+ br label %bci_37-aload
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_i32f(i64, i32, i32 ()*, i32, i32, ...)
+
+; Function Attrs: nounwind
+declare i32 @llvm.experimental.gc.result.i32(token) #4
+
+; Function Attrs: nounwind
+declare %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token, i32, i32) #4
+
+; Function Attrs: nounwind
+declare %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token, i32, i32) #4
+
+attributes #0 = { noinline nounwind "gc-leaf-function"="true" }
+attributes #1 = { "gc-leaf-function"="true" }
+attributes #2 = { nounwind readonly "gc-leaf-function"="true" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind }
+
+!0 = !{i32 1}
diff --git a/test/SafepointIRVerifier/uses-in-phi-nodes.ll b/test/SafepointIRVerifier/uses-in-phi-nodes.ll
new file mode 100644
index 0000000000000..d06eb6e0d9a7c
--- /dev/null
+++ b/test/SafepointIRVerifier/uses-in-phi-nodes.ll
@@ -0,0 +1,78 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
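+; A phi is an illegal use when an unrelocated incoming pointer reaches it
+; across a safepoint on that edge; edges that never cross a safepoint, and
+; incoming nulls, are fine (see the test.ok.* functions below).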
+
+define i8 addrspace(1)* @test.not.ok.0(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test.not.ok.0
+ bci_0:
+ br i1 undef, label %left, label %right
+
+ left:
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ br label %merge
+
+ right:
+ br label %merge
+
+ merge:
+; CHECK: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: i8 addrspace(1)* %arg
+; CHECK-NEXT: Use: %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right ]
+ %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right]
+ ret i8 addrspace(1)* %val
+}
+
+define i8 addrspace(1)* @test.not.ok.1(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test.not.ok.1
+ bci_0:
+ br i1 undef, label %left, label %right
+
+ left:
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ br label %merge
+
+ right:
+ br label %merge
+
+ merge:
+; CHECK: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: i8 addrspace(1)* %arg
+; CHECK-NEXT: Use: %val = phi i8 addrspace(1)* [ %arg, %left ], [ null, %right ]
+ %val = phi i8 addrspace(1)* [ %arg, %left ], [ null, %right]
+ ret i8 addrspace(1)* %val
+}
+
+define i8 addrspace(1)* @test.ok.0(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test.ok.0
+ bci_0:
+ br i1 undef, label %left, label %right
+
+ left:
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ br label %merge
+
+ right:
+ br label %merge
+
+ merge:
+ %val = phi i8 addrspace(1)* [ null, %left ], [ null, %right]
+ ret i8 addrspace(1)* %val
+}
+
+define i8 addrspace(1)* @test.ok.1(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test.ok.1
+ bci_0:
+ br i1 undef, label %left, label %right
+
+ left:
+ call void @not_statepoint()
+ br label %merge
+
+ right:
+ br label %merge
+
+ merge:
+ %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right]
+ ret i8 addrspace(1)* %val
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare void @not_statepoint()
diff --git a/test/TableGen/AsmVariant.td b/test/TableGen/AsmVariant.td
index cb5d32385d3b3..70d410ee7bd06 100644
--- a/test/TableGen/AsmVariant.td
+++ b/test/TableGen/AsmVariant.td
@@ -31,6 +31,7 @@ def foo : Instruction {
let InOperandList = (ins);
let AsmString = "foo";
let AsmVariantName = "Foo";
+ let Namespace = "Arch";
}
def BarAlias : InstAlias<"bar", (foo)> {
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index 7c09b97a5e998..114d0e23b855b 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -7,6 +7,10 @@ include "llvm/Target/Target.td"
def MyTargetISA : InstrInfo;
def MyTarget : Target { let InstructionSet = MyTargetISA; }
+let TargetPrefix = "mytarget" in {
+def int_mytarget_nop : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+}
+
def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
def GPR32Op : RegisterOperand<GPR32>;
@@ -38,6 +42,23 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
//===- Test the function boilerplate. -------------------------------------===//
+// CHECK: const unsigned MAX_SUBTARGET_PREDICATES = 3;
+// CHECK: using PredicateBitset = llvm::PredicateBitsetImpl<MAX_SUBTARGET_PREDICATES>;
+
+// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_DECL
+// CHECK-NEXT: mutable MatcherState State;
+// CHECK-NEXT: typedef ComplexRendererFn(MyTargetInstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const;
+// CHECK-NEXT: const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> MatcherInfo;
+// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL
+
+// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT
+// CHECK-NEXT: , State(2),
+// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, {
+// CHECK-NEXT: nullptr, // GICP_Invalid
+// CHECK-NEXT: &MyTargetInstructionSelector::selectComplexPattern, // gi_complex
+// CHECK-NEXT: }})
+// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT
+
// CHECK-LABEL: enum SubtargetFeatureBits : uint8_t {
// CHECK-NEXT: Feature_HasABit = 0,
// CHECK-NEXT: Feature_HasBBit = 1,
@@ -63,39 +84,104 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: }
// CHECK: bool MyTargetInstructionSelector::selectImpl(MachineInstr &I) const {
-// CHECK: MachineFunction &MF = *I.getParent()->getParent();
-// CHECK: const MachineRegisterInfo &MRI = MF.getRegInfo();
+// CHECK-NEXT: MachineFunction &MF = *I.getParent()->getParent();
+// CHECK-NEXT: MachineRegisterInfo &MRI = MF.getRegInfo();
+// CHECK: AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, &MF);
+// CHECK-NEXT: const PredicateBitset AvailableFeatures = getAvailableFeatures();
+// CHECK-NEXT: NewMIVector OutMIs;
+// CHECK-NEXT: State.MIs.clear();
+// CHECK-NEXT: State.MIs.push_back(&I);
+
+//===- Test a pattern with multiple ComplexPatterns in multiple instrs ----===//
+//
+
+// CHECK-LABEL: MatchTable0[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SELECT,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex,
+// CHECK-NEXT: // MIs[0] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/3, /*Renderer*/1, GICP_gi_complex,
+// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, complex:i32:$src3) => (INSN2:i32 GPR32:i32:$src1, complex:i32:$src3, complex:i32:$src2)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN2,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/1,
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0,
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable0\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable0, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+
+def INSN3 : I<(outs GPR32:$dst),
+ (ins GPR32Op:$src1, complex:$src2, GPR32:$src3, complex:$src4, complex:$src5), []>;
+def : Pat<(select GPR32:$src1, complex:$src2, (select GPR32:$src3, complex:$src4, complex:$src5)),
+ (INSN3 GPR32:$src1, complex:$src2, GPR32:$src3, complex:$src4, complex:$src5)>;
//===- Test a pattern with multiple ComplexPattern operands. --------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 4)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_SELECT) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((Renderer0 = selectComplexPattern(MI0.getOperand(2)))))) &&
-// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(3).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((Renderer1 = selectComplexPattern(MI0.getOperand(3))))))) {
-// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, complex:i32:$src3) => (INSN2:i32 GPR32:i32:$src1, complex:i32:$src3, complex:i32:$src2)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN2));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: Renderer1(MIB);
-// CHECK-NEXT: Renderer0(MIB);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable1[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/3, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/4,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SELECT,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex,
+// CHECK-NEXT: // MIs[0] Operand 3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_SELECT,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src4
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/2, /*Renderer*/1, GICP_gi_complex,
+// CHECK-NEXT: // MIs[1] src5
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/3, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/3, /*Renderer*/2, GICP_gi_complex,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, (select:i32 GPR32:i32:$src3, complex:i32:$src4, complex:i32:$src5)) => (INSN3:i32 GPR32:i32:$src1, complex:i32:$src2, GPR32:i32:$src3, complex:i32:$src4, complex:i32:$src5)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN3,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src3
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/1,
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/2,
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable1\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable1, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def : GINodeEquiv<G_SELECT, select>;
def INSN2 : I<(outs GPR32:$dst), (ins GPR32Op:$src1, complex:$src2, complex:$src3), []>;
@@ -104,119 +190,149 @@ def : Pat<(select GPR32:$src1, complex:$src2, complex:$src3),
//===- Test a simple pattern with regclass operands. ----------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-
-// CHECK-NEXT: // (add:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (ADD:i32 GPR32:i32:$src1, GPR32:i32:$src2)
-// CHECK-NEXT: I.setDesc(TII.get(MyTarget::ADD));
-// CHECK-NEXT: MachineInstr &NewI = I;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
-
+// CHECK-LABEL: MatchTable2[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (add:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (ADD:i32 GPR32:i32:$src1, GPR32:i32:$src2)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/MyTarget::ADD,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable2\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable2, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
[(set GPR32:$dst, (add GPR32:$src1, GPR32:$src2))]>;
+//===- Test a simple pattern with an intrinsic. ---------------------------===//
+//
+
+// CHECK-LABEL: MatchTable3[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_INTRINSIC,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, Intrinsic::mytarget_nop,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (intrinsic_wo_chain:i32 [[ID:[0-9]+]]:iPTR, GPR32:i32:$src1) => (MOV:i32 GPR32:i32:$src1)
+
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MOV,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable3\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable3, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+
+def MOV : I<(outs GPR32:$dst), (ins GPR32:$src1),
+ [(set GPR32:$dst, (int_mytarget_nop GPR32:$src1))]>;
+
//===- Test a nested instruction match. -----------------------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (!MI0.getOperand(1).isReg())
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(1).getReg()))
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(1).getReg());
-// CHECK-NEXT: if (MI1.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
-// CHECK-NEXT: ))) &&
-// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-// CHECK-NEXT: if (!isObviouslySafeToFold(MI1)) return false;
-// CHECK-NEXT: // (mul:i32 (add:i32 GPR32:i32:$src1, GPR32:i32:$src2), GPR32:i32:$src3) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(1)/*src1*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src3*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, &MI1, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable4[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:i32 (add:i32 GPR32:i32:$src1, GPR32:i32:$src2), GPR32:i32:$src3) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MULADD,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src3
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable4\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable4, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// We also get a second rule by commutativity.
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (!MI0.getOperand(2).isReg())
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(2).getReg()))
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(2).getReg());
-// CHECK-NEXT: if (MI1.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
-// CHECK-NEXT: )))) {
-// CHECK-NEXT: if (!isObviouslySafeToFold(MI1)) return false;
-// CHECK-NEXT: // (mul:i32 GPR32:i32:$src3, (add:i32 GPR32:i32:$src1, GPR32:i32:$src2)) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(1)/*src1*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src3*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, &MI1, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable5[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:i32 GPR32:i32:$src3, (add:i32 GPR32:i32:$src1, GPR32:i32:$src2)) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MULADD,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src3
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable5\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable5, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
[(set GPR32:$dst,
@@ -225,67 +341,129 @@ def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
//===- Test another simple pattern with regclass operands. ----------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit, Feature_HasBBit, Feature_HasCBit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-// CHECK-NEXT: // (mul:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (MUL:i32 GPR32:i32:$src2, GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MUL));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable6[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA_HasB_HasC,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (mul:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (MUL:i32 GPR32:i32:$src2, GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MUL,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable6\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable6, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def MUL : I<(outs GPR32:$dst), (ins GPR32:$src2, GPR32:$src1),
[(set GPR32:$dst, (mul GPR32:$src1, GPR32:$src2))]>,
Requires<[HasA, HasB, HasC]>;
+//===- Test a more complex multi-instruction match. -----------------------===//
+
+// CHECK-LABEL: MatchTable7[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/2, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[2] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[2] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/2, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[2] src4
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/2, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2,
+// CHECK-NEXT: // (sub:i32 (sub:i32 GPR32:i32:$src1, GPR32:i32:$src2), (sub:i32 GPR32:i32:$src3, GPR32:i32:$src4)) => (INSNBOB:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3, GPR32:i32:$src4)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSNBOB,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/2, /*OpIdx*/1, // src3
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/2, /*OpIdx*/2, // src4
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable7\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable7, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+
+def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
+ [(set GPR32:$dst,
+ (sub (sub GPR32:$src1, GPR32:$src2), (sub GPR32:$src3, GPR32:$src4)))]>,
+ Requires<[HasA]>;
+
//===- Test a pattern with ComplexPattern operands. -----------------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_SUB) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((Renderer0 = selectComplexPattern(MI0.getOperand(2))))))) {
-// CHECK-NEXT: // (sub:i32 GPR32:i32:$src1, complex:i32:$src2) => (INSN1:i32 GPR32:i32:$src1, complex:i32:$src2)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN1));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: Renderer0(MIB);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable8[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex,
+// CHECK-NEXT: // (sub:i32 GPR32:i32:$src1, complex:i32:$src2) => (INSN1:i32 GPR32:i32:$src1, complex:i32:$src2)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN1,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0,
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable8\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable8, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def INSN1 : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2), []>;
def : Pat<(sub GPR32:$src1, complex:$src2), (INSN1 GPR32:$src1, complex:$src2)>;
@@ -293,32 +471,33 @@ def : Pat<(sub GPR32:$src1, complex:$src2), (INSN1 GPR32:$src1, complex:$src2)>;
//===- Test a simple pattern with a default operand. ----------------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -2, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -2:i32) => (XORI:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORI));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addImm(-1);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable9[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -2
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -2:i32) => (XORI:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORI,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable9\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable9, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -2 is just to distinguish it from the 'not' case below.
def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
@@ -327,32 +506,33 @@ def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
//===- Test a simple pattern with a default register operand. -------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -3, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -3:i32) => (XOR:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XOR));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable10[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -3
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -3:i32) => (XOR:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XOR,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable10\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable10, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -3 is just to distinguish it from the 'not' case below and the other default op case above.
def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1),
@@ -361,33 +541,34 @@ def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1),
//===- Test a simple pattern with a multiple default operands. ------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -4, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -4:i32) => (XORlike:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORlike));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addImm(-1);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable11[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -4
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -4:i32) => (XORlike:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORlike,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1,
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable11\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable11, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -4 is just to distinguish it from the other 'not' cases.
def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1),
@@ -396,34 +577,35 @@ def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1),
//===- Test a simple pattern with multiple operands with defaults. --------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -5, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -5:i32) => (XORManyDefaults:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORManyDefaults));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addImm(-1);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable12[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -5,
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -5:i32) => (XORManyDefaults:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORManyDefaults,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1,
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable12\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable12, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -5 is just to distinguish it from the other cases.
def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1),
@@ -434,32 +616,33 @@ def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1)
// This must precede the 3-register variants because constant immediates have
// priority over register banks.
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Wm */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -1, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$Wm, -1:i32) => (ORN:i32 R0:i32, GPR32:i32:$Wm)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::ORN));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*Wm*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable13[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Wm
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -1,
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$Wm, -1:i32) => (ORN:i32 R0:i32, GPR32:i32:$Wm)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::ORN,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // Wm
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable13\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable13, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def ORN : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>;
@@ -467,70 +650,72 @@ def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>;
//===- Test a COPY_TO_REGCLASS --------------------------------------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 2)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_BITCAST) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::FPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))))
-// CHECK-NEXT: // (bitconvert:i32 FPR32:f32:$src1) => (COPY_TO_REGCLASS:i32 FPR32:f32:$src1, GPR32:i32)
-// CHECK-NEXT: I.setDesc(TII.get(TargetOpcode::COPY));
-// CHECK-NEXT: MachineInstr &NewI = I;
-// CHECK-NEXT: constrainOperandRegToRegClass(NewI, 0, MyTarget::GPR32RegClass, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable14[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_BITCAST,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::FPR32RegClassID,
+// CHECK-NEXT: // (bitconvert:i32 FPR32:f32:$src1) => (COPY_TO_REGCLASS:i32 FPR32:f32:$src1, GPR32:i32)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/TargetOpcode::COPY,
+// CHECK-NEXT: GIR_ConstrainOperandRC, /*InsnID*/0, /*Op*/0, /*RC GPR32*/ 1,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable14\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable14, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def : Pat<(i32 (bitconvert FPR32:$src1)),
(COPY_TO_REGCLASS FPR32:$src1, GPR32)>;
//===- Test a simple pattern with just a leaf immediate. ------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 2)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_CONSTANT) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 1 */ (MI0.getOperand(1).isCImm() && MI0.getOperand(1).getCImm()->equalsInt(1))))) {
-// CHECK-NEXT: // 1:i32 => (MOV1:i32)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MOV1));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable15[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_CONSTANT,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckLiteralInt, /*MI*/0, /*Op*/1, 1,
+// CHECK-NEXT: // 1:i32 => (MOV1:i32)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MOV1,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable15\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable15, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def MOV1 : I<(outs GPR32:$dst), (ins), [(set GPR32:$dst, 1)]>;
//===- Test a pattern with an MBB operand. --------------------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 1)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_BR) &&
-// CHECK-NEXT: ((/* target */ (MI0.getOperand(0).isMBB())))) {
-
-// CHECK-NEXT: // (br (bb:Other):$target) => (BR (bb:Other):$target)
-// CHECK-NEXT: I.setDesc(TII.get(MyTarget::BR));
-// CHECK-NEXT: MachineInstr &NewI = I;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable16[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/1,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_BR,
+// CHECK-NEXT: // MIs[0] target
+// CHECK-NEXT: GIM_CheckIsMBB, /*MI*/0, /*Op*/0,
+// CHECK-NEXT: // (br (bb:Other):$target) => (BR (bb:Other):$target)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/MyTarget::BR,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable16\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable16, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def BR : I<(outs), (ins unknown:$target),
[(br bb:$target)]>;
diff --git a/test/TableGen/UnterminatedComment.td b/test/TableGen/UnterminatedComment.td
index f92525a991644..f386e4cef83be 100644
--- a/test/TableGen/UnterminatedComment.td
+++ b/test/TableGen/UnterminatedComment.td
@@ -1,4 +1,4 @@
-// RUN: not llvm-tblgen < %s >& /dev/null
+// RUN: not llvm-tblgen < %s > /dev/null 2>&1
def x;
diff --git a/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
new file mode 100644
index 0000000000000..7ce8ab3ac5215
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
@@ -0,0 +1,38 @@
+; RUN: opt -argpromotion -verify -dse -S %s -o - | FileCheck %s
+
+; Fix for PR33641. ArgumentPromotion removed the argument to @bar but left the
+; call to dbg.value, which still used the removed argument.
+
+%p_t = type i16*
+%fun_t = type void (%p_t)*
+
+define void @foo() {
+ %tmp = alloca %fun_t
+ store %fun_t @bar, %fun_t* %tmp
+ ret void
+}
+
+define internal void @bar(%p_t %p) {
+ call void @llvm.dbg.value(metadata %p_t %p, i64 0, metadata !4, metadata !5), !dbg !6
+ ret void
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "bar", unit: !0)
+!4 = !DILocalVariable(name: "p", scope: !3)
+!5 = !DIExpression()
+!6 = !DILocation(line: 1, column: 1, scope: !3)
+
+; The %p argument should be removed, and the use of it in dbg.value should be
+; changed to undef.
+; CHECK: define internal void @bar() {
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i16* undef
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
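; In short: once the argument is promoted away, @bar has no parameter for the
; dbg.value to refer to, so the expected rewrite keeps the debug intrinsic but
; points its operand at undef rather than leaving a dangling use of the
; deleted %p.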
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
index 2435cd7d0a830..4b9e7c3956f58 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
@@ -5,8 +6,8 @@ declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp2(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
@@ -23,7 +24,7 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp3(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
@@ -32,8 +33,8 @@ define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp4(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
@@ -50,7 +51,7 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp5(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
@@ -59,7 +60,7 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp6(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
@@ -68,7 +69,7 @@ define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp7(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
@@ -77,12 +78,12 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp8(
-; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8)
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp8(
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64*
-; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
@@ -99,7 +100,7 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp9(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
@@ -108,7 +109,7 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp10(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
@@ -117,7 +118,7 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp11(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
@@ -126,7 +127,7 @@ define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp12(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
@@ -135,7 +136,7 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp13(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
@@ -144,7 +145,7 @@ define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp14(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
@@ -153,7 +154,7 @@ define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp15(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
@@ -162,7 +163,7 @@ define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp16(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
; ALL-NEXT: ret i32 [[CALL]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
@@ -171,8 +172,8 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq2(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
@@ -189,7 +190,7 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq3(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -202,8 +203,8 @@ define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq4(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
@@ -220,7 +221,7 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq5(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -233,7 +234,7 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq6(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -246,7 +247,7 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq7(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -259,14 +260,14 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq8(
-; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq8(
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64*
-; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
@@ -283,7 +284,7 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq9(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -296,7 +297,7 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq10(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -309,7 +310,7 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq11(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -322,7 +323,7 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq12(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -335,7 +336,7 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq13(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -348,7 +349,7 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq14(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -361,7 +362,7 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq15(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -374,7 +375,7 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq16(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
index 9d6e668167fbb..b6b7757978263 100644
--- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -4,6 +4,8 @@ target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
+@x = external global [1 x [2 x <4 x float>]]
+
; Can we sink single addressing mode computation to use?
define void @test1(i1 %cond, i64* %base) {
; CHECK-LABEL: @test1
@@ -194,3 +196,25 @@ rare.2:
declare void @slowpath(i32, i32*)
+
+; Make sure we don't end up in an infinite loop after we fail to sink.
+; CHECK-LABEL: define void @test8
+; CHECK: %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+define void @test8() {
+allocas:
+ %aFOO_load = load float*, float** undef
+ %aFOO_load_ptr2int = ptrtoint float* %aFOO_load to i64
+ %aFOO_load_ptr2int_broadcast_init = insertelement <4 x i64> undef, i64 %aFOO_load_ptr2int, i32 0
+ %aFOO_load_ptr2int_2void = inttoptr i64 %aFOO_load_ptr2int to i8*
+ %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+ br label %load.i145
+
+load.i145:
+ %ptr.i143 = bitcast i8* %ptr to <4 x float>*
+ %valall.i144 = load <4 x float>, <4 x float>* %ptr.i143, align 4
+ %x_offset = getelementptr [1 x [2 x <4 x float>]], [1 x [2 x <4 x float>]]* @x, i32 0, i64 0
+ br label %pl_loop.i.i122
+
+pl_loop.i.i122:
+ br label %pl_loop.i.i122
+}
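; A note on what @test8 guards against: CodeGenPrepare's addressing-mode
; sinking can repeatedly attempt, and fail, to sink the same GEP computation;
; the CHECK line above only asserts that the GEP is still in place afterwards,
; i.e. that the pass gave up cleanly rather than iterating forever.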
diff --git a/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll b/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll
new file mode 100644
index 0000000000000..3808c0e61c10a
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+;
+; Ensure that we don't {crash,return a bad value} when given an alloca larger
+; than what a pointer can represent.
+
+target datalayout = "p:16:16"
+
+; CHECK-LABEL: @alloca_overflow_is_unknown(
+define i16 @alloca_overflow_is_unknown() {
+ %i = alloca i8, i32 65537
+ %j = call i16 @llvm.objectsize.i16.p0i8(i8* %i, i1 false, i1 false)
+ ; CHECK: ret i16 -1
+ ret i16 %j
+}
+
+declare i16 @llvm.objectsize.i16.p0i8(i8*, i1, i1)
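; Worked numbers for the test above: under the stated datalayout "p:16:16",
; pointers are 16 bits wide, so the largest representable object size is
; 2^16 - 1 = 65535. The alloca of 65537 (= 2^16 + 1) bytes overflows that
; range, so llvm.objectsize cannot compute a size and conservatively folds to
; -1 ("unknown"), which is exactly what the CHECK line asserts.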
diff --git a/test/Transforms/ConstantHoisting/ARM/bad-cases.ll b/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
index ffcfb2e56c95d..315e69998c627 100644
--- a/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
+++ b/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
@@ -107,3 +107,34 @@ entry:
%ret = add i32 %cast0, %cast1
ret i32 %ret
}
+
+@exception_type = external global i8
+
+; Constants in inline ASM should not be hoisted.
+define i32 @inline_asm_invoke() personality i8* null {
+;CHECK-LABEL: @inline_asm_invoke
+;CHECK-NOT: %const = 214672
+;CHECK: %X = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+ %X = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+ to label %L unwind label %lpad
+;CHECK: %Y = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+ %Y = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+ to label %L unwind label %lpad
+L:
+ ret i32 %X
+lpad:
+ %lp = landingpad i32
+ cleanup
+ catch i8* @exception_type
+ ret i32 1
+}
+
+define i32 @inline_asm_call() {
+;CHECK-LABEL: @inline_asm_call
+;CHECK-NOT: %const = 214672
+;CHECK: %X = call i32 asm "bswap $0", "=r,r"(i32 214672)
+ %X = call i32 asm "bswap $0", "=r,r"(i32 214672)
+;CHECK: %Y = call i32 asm "bswap $0", "=r,r"(i32 214672)
+ %Y = call i32 asm "bswap $0", "=r,r"(i32 214672)
+ ret i32 %X
+}
diff --git a/test/Transforms/ConstantHoisting/ARM/insertvalue.ll b/test/Transforms/ConstantHoisting/ARM/insertvalue.ll
new file mode 100644
index 0000000000000..99fe7fbe22a56
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/ARM/insertvalue.ll
@@ -0,0 +1,31 @@
+; RUN: opt -consthoist -S < %s | FileCheck %s
+target triple = "thumbv6m-none-eabi"
+
+%T = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32 }
+
+; The second operand of insertvalue (the value being inserted) can be hoisted.
+define void @test1(%T %P) {
+; CHECK-LABEL: @test1
+; CHECK: %const = bitcast i32 256 to i32
+; CHECK: %1 = insertvalue %T %P, i32 %const, 256
+; CHECK: %2 = insertvalue %T %P, i32 %const, 256
+ %1 = insertvalue %T %P, i32 256, 256
+ %2 = insertvalue %T %P, i32 256, 256
+ ret void
+}
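; Reading the CHECK lines above: the repeated i32 value 256 is presumably
; expensive to materialize on thumbv6m, so -consthoist rewrites the value
; operand of both insertvalues to a single hoisted %const; the trailing 256
; is untouched because insertvalue indices are structural immediates, not
; value operands.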
diff --git a/test/Transforms/ConstantHoisting/X86/ehpad.ll b/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 4f87572f34472..5e345c4515d71 100644
--- a/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -1,9 +1,6 @@
-; RUN: opt -S -consthoist < %s | FileCheck %s
+; RUN: opt -S -consthoist -consthoist-with-block-frequency=false < %s | FileCheck %s
; RUN: opt -S -consthoist -consthoist-with-block-frequency=true < %s | FileCheck --check-prefix=BFIHOIST %s
-; FIXME: The catchpad doesn't even use the constant, so a better fix would be to
-; insert the bitcast in the catchpad block.
-
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
diff --git a/test/Transforms/GVN/PRE/atomic.ll b/test/Transforms/GVN/PRE/atomic.ll
index 509acd613e955..3479bc9a0e33a 100644
--- a/test/Transforms/GVN/PRE/atomic.ll
+++ b/test/Transforms/GVN/PRE/atomic.ll
@@ -208,14 +208,14 @@ define void @fence_seq_cst(i32* %P1, i32* %P2) {
ret void
}
-; Can't DSE across a full singlethread fence
+; Can't DSE across a full syncscope("singlethread") fence
define void @fence_seq_cst_st(i32* %P1, i32* %P2) {
; CHECK-LABEL: @fence_seq_cst_st(
; CHECK: store
-; CHECK: fence singlethread seq_cst
+; CHECK: fence syncscope("singlethread") seq_cst
; CHECK: store
store i32 0, i32* %P1, align 4
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
store i32 0, i32* %P1, align 4
ret void
}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
deleted file mode 100644
index 78681e20df5e1..0000000000000
--- a/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; RUN: opt < %s -gvn -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@a = common global [100 x i64] zeroinitializer, align 16
-@b = common global [100 x i64] zeroinitializer, align 16
-@g1 = common global i64 0, align 8
-@g2 = common global i64 0, align 8
-@g3 = common global i64 0, align 8
-declare i64 @goo(...) local_unnamed_addr #1
-
-define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
-entry:
- %mul = mul nsw i64 %b, %a
- store i64 %mul, i64* @g1, align 8
- %t0 = load i64, i64* @g2, align 8
- %cmp = icmp sgt i64 %t0, 3
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- %mul2 = mul nsw i64 %d, %c
- store i64 %mul2, i64* @g2, align 8
- br label %if.end
-
-; Check phi-translate works and mul is removed.
-; CHECK-LABEL: @test1(
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
- %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
- %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
- store i64 %mul3, i64* @g3, align 8
- ret void
-}
-
-define void @test2(i64 %i) {
-entry:
- %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
- %t0 = load i64, i64* %arrayidx, align 8
- %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
- %t1 = load i64, i64* %arrayidx1, align 8
- %mul = mul nsw i64 %t1, %t0
- store i64 %mul, i64* @g1, align 8
- %cmp = icmp sgt i64 %mul, 3
- br i1 %cmp, label %if.then, label %if.end
-
-; Check phi-translate works for the phi generated by loadpre. A new mul will be
-; inserted in if.then block.
-; CHECK-LABEL: @test2(
-; CHECK: if.then:
-; CHECK: %[[MUL_THEN:.*]] = mul
-; CHECK: br label %if.end
-if.then: ; preds = %entry
- %call = tail call i64 (...) @goo() #2
- store i64 %call, i64* @g2, align 8
- br label %if.end
-
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ]
- %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0
- %t2 = load i64, i64* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0
- %t3 = load i64, i64* %arrayidx4, align 8
- %mul5 = mul nsw i64 %t3, %t2
- store i64 %mul5, i64* @g3, align 8
- ret void
-}
-
-; Check phi-translate doesn't go through backedge, which may lead to incorrect
-; pre transformation.
-; CHECK: for.end:
-; CHECK-NOT: %{{.*pre-phi}} = phi
-; CHECK: ret void
-define void @test3(i64 %N, i64* nocapture readonly %a) {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
- %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ]
- %add = add nuw nsw i64 %i.0, 1
- %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add
- %tmp0 = load i64, i64* %arrayidx, align 8
- %cmp = icmp slt i64 %i.0, %N
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %call = tail call i64 (...) @goo() #2
- %add1 = sub nsw i64 0, %call
- %tobool = icmp eq i64 %tmp0, %add1
- br i1 %tobool, label %for.cond, label %for.end
-
-for.end: ; preds = %for.body, %for.cond
- %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ]
- %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa
- %tmp1 = load i64, i64* %arrayidx2, align 8
- store i64 %tmp1, i64* @g1, align 8
- ret void
-}
-
-; It is incorrect to use the value of %andres in last loop iteration
-; to do pre.
-; CHECK-LABEL: @test4(
-; CHECK: for.body:
-; CHECK-NOT: %andres.pre-phi = phi i32
-; CHECK: br i1 %tobool1
-
-define i32 @test4(i32 %cond, i32 %SectionAttrs.0231.ph, i32 *%AttrFlag) {
-for.body.preheader:
- %t514 = load volatile i32, i32* %AttrFlag
- br label %for.body
-
-for.body:
- %t320 = phi i32 [ %t334, %bb343 ], [ %t514, %for.body.preheader ]
- %andres = and i32 %t320, %SectionAttrs.0231.ph
- %tobool1 = icmp eq i32 %andres, 0
- br i1 %tobool1, label %bb343, label %critedge.loopexit
-
-bb343:
- %t334 = load volatile i32, i32* %AttrFlag
- %tobool2 = icmp eq i32 %cond, 0
- br i1 %tobool2, label %critedge.loopexit, label %for.body
-
-critedge.loopexit:
- unreachable
-}
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 1b2b4d20d31da..9eec8bb6455b4 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -37,7 +37,7 @@ sw.bb2: ; preds = %if.end, %entry
%3 = load double, double* %arrayidx5, align 8
; CHECK: sw.bb2:
; CHECK-NOT: sext
-; CHECK: phi double [
+; CHECK-NEXT: phi double [
; CHECK-NOT: load
%sub6 = fsub double 3.000000e+00, %3
br label %return
diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll
index ffff2b7f08e53..685df24f62b65 100644
--- a/test/Transforms/GVN/PRE/pre-load.ll
+++ b/test/Transforms/GVN/PRE/pre-load.ll
@@ -72,7 +72,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -104,7 +104,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -263,7 +263,7 @@ block4:
%PRE = load i32, i32* %P3
ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
diff --git a/test/Transforms/IndVarSimplify/canonicalize-cmp.ll b/test/Transforms/IndVarSimplify/canonicalize-cmp.ll
new file mode 100644
index 0000000000000..2b939767284a4
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/canonicalize-cmp.ll
@@ -0,0 +1,98 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; Check that we replace signed comparisons between non-negative values with
+; unsigned comparisons if we can.
+
+target datalayout = "n8:16:32:64"
+
+define i32 @test_01(i32 %a, i32 %b, i32* %p) {
+
+; CHECK-LABEL: @test_01(
+; CHECK-NOT: icmp slt
+; CHECK: %cmp1 = icmp ult i32 %iv, 100
+; CHECK: %cmp2 = icmp ult i32 %iv, 100
+; CHECK-NOT: %cmp3
+; CHECK: %exitcond = icmp ne i32 %iv.next, 1000
+
+entry:
+ br label %loop.entry
+
+loop.entry:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.be ]
+ %cmp1 = icmp slt i32 %iv, 100
+ br i1 %cmp1, label %b1, label %b2
+
+b1:
+ store i32 %iv, i32* %p
+ br label %merge
+
+b2:
+ store i32 %a, i32* %p
+ br label %merge
+
+merge:
+ %cmp2 = icmp ult i32 %iv, 100
+ br i1 %cmp2, label %b3, label %b4
+
+b3:
+ store i32 %iv, i32* %p
+ br label %loop.be
+
+b4:
+ store i32 %b, i32* %p
+ br label %loop.be
+
+loop.be:
+ %iv.next = add i32 %iv, 1
+ %cmp3 = icmp slt i32 %iv.next, 1000
+ br i1 %cmp3, label %loop.entry, label %exit
+
+exit:
+ ret i32 %iv
+}
+
+define i32 @test_02(i32 %a, i32 %b, i32* %p) {
+
+; CHECK-LABEL: @test_02(
+; CHECK-NOT: icmp sgt
+; CHECK: %cmp1 = icmp ugt i32 100, %iv
+; CHECK: %cmp2 = icmp ugt i32 100, %iv
+; CHECK-NOT: %cmp3
+; CHECK: %exitcond = icmp ne i32 %iv.next, 1000
+
+entry:
+ br label %loop.entry
+
+loop.entry:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.be ]
+ %cmp1 = icmp sgt i32 100, %iv
+ br i1 %cmp1, label %b1, label %b2
+
+b1:
+ store i32 %iv, i32* %p
+ br label %merge
+
+b2:
+ store i32 %a, i32* %p
+ br label %merge
+
+merge:
+ %cmp2 = icmp ugt i32 100, %iv
+ br i1 %cmp2, label %b3, label %b4
+
+b3:
+ store i32 %iv, i32* %p
+ br label %loop.be
+
+b4:
+ store i32 %b, i32* %p
+ br label %loop.be
+
+loop.be:
+ %iv.next = add i32 %iv, 1
+ %cmp3 = icmp sgt i32 1000, %iv.next
+ br i1 %cmp3, label %loop.entry, label %exit
+
+exit:
+ ret i32 %iv
+}
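; A sketch of why the checked rewrite is sound: %iv starts at 0 and steps by
; +1 up to an exit bound of 1000, so both comparison operands stay within
; [0, 1000], well below 2^31. For operands whose sign bit is clear, signed and
; unsigned orderings coincide: for any 0 <= a, b < 2^31,
; (icmp slt a, b) == (icmp ult a, b). The CHECK lines rely on exactly that
; equivalence.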
diff --git a/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
index 612f01e3cadee..a63617e62c0ea 100644
--- a/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -111,7 +111,7 @@ return:
; Indvars should not turn the second loop into an infinite one.
; CHECK-LABEL: @func_11(
-; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: %tmp5 = icmp ult i32 %__key6.0, 10
; CHECK-NOT: br i1 true, label %noassert68, label %unrolledend
define i32 @func_11() nounwind uwtable {
@@ -163,7 +163,7 @@ declare void @llvm.trap() noreturn nounwind
; In this case the second loop only has a single iteration, fold the header away
; CHECK-LABEL: @func_12(
-; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: %tmp5 = icmp ult i32 %__key6.0, 10
; CHECK: br i1 true, label %noassert68, label %unrolledend
define i32 @func_12() nounwind uwtable {
entry:
diff --git a/test/Transforms/IndVarSimplify/strengthen-overflow.ll b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
index 2bafe96e1cccd..6e0538e04d6bd 100644
--- a/test/Transforms/IndVarSimplify/strengthen-overflow.ll
+++ b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
@@ -104,5 +104,89 @@ define i32 @test.unsigned.add.1(i32* %array, i32 %length, i32 %init) {
ret i32 42
}
+define hidden void @test.shl.exact.equal() {
+; CHECK-LABEL: @test.shl.exact.equal
+entry:
+ br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+ %k.021 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+ %shl = shl i32 1, %k.021
+ %shr1 = ashr i32 %shl, 1
+; CHECK: %shr1 = ashr exact i32 %shl, 1
+ %shr2 = lshr i32 %shl, 1
+; CHECK: %shr2 = lshr exact i32 %shl, 1
+ %inc = add nuw nsw i32 %k.021, 1
+ %exitcond = icmp eq i32 %inc, 9
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define hidden void @test.shl.exact.greater() {
+; CHECK-LABEL: @test.shl.exact.greater
+entry:
+ br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+ %k.021 = phi i32 [ 3, %entry ], [ %inc, %for.body ]
+ %shl = shl i32 1, %k.021
+ %shr1 = ashr i32 %shl, 2
+; CHECK: %shr1 = ashr exact i32 %shl, 2
+ %shr2 = lshr i32 %shl, 2
+; CHECK: %shr2 = lshr exact i32 %shl, 2
+ %inc = add nuw nsw i32 %k.021, 1
+ %exitcond = icmp eq i32 %inc, 9
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define hidden void @test.shl.exact.unbound(i32 %arg) {
+; CHECK-LABEL: @test.shl.exact.unbound
+entry:
+ br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+ %k.021 = phi i32 [ 2, %entry ], [ %inc, %for.body ]
+ %shl = shl i32 1, %k.021
+ %shr1 = ashr i32 %shl, 2
+; CHECK: %shr1 = ashr exact i32 %shl, 2
+ %shr2 = lshr i32 %shl, 2
+; CHECK: %shr2 = lshr exact i32 %shl, 2
+ %inc = add nuw nsw i32 %k.021, 1
+ %exitcond = icmp eq i32 %inc, %arg
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define hidden void @test.shl.nonexact() {
+; CHECK-LABEL: @test.shl.nonexact
+entry:
+ br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+ %k.021 = phi i32 [ 2, %entry ], [ %inc, %for.body ]
+ %shl = shl i32 1, %k.021
+ %shr1 = ashr i32 %shl, 3
+; CHECK: %shr1 = ashr i32 %shl, 3
+ %shr2 = lshr i32 %shl, 3
+; CHECK: %shr2 = lshr i32 %shl, 3
+ %inc = add nuw nsw i32 %k.021, 1
+ %exitcond = icmp eq i32 %inc, 9
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
!0 = !{i32 0, i32 2}
!1 = !{i32 0, i32 42}
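; A sketch of the reasoning the new shl tests exercise: %shl = shl i32 1,
; %k.021 has exactly one set bit, at position %k.021. A right shift by m only
; discards zero bits when m <= %k.021, in which case indvars may mark the
; shift "exact". In @test.shl.nonexact the first iteration has %k.021 = 2 and
; a shift amount of 3, so the set bit would be discarded and the flag must not
; be added.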
diff --git a/test/Transforms/IndVarSimplify/widen-loop-comp.ll b/test/Transforms/IndVarSimplify/widen-loop-comp.ll
index b87cd0550192e..2d24cd732ce84 100644
--- a/test/Transforms/IndVarSimplify/widen-loop-comp.ll
+++ b/test/Transforms/IndVarSimplify/widen-loop-comp.ll
@@ -64,7 +64,7 @@ for.end:
; CHECK-LABEL: @test2
; CHECK: for.body4.us
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK: %cmp2.us = icmp slt i64
+; CHECK: %cmp2.us = icmp ult i64
; CHECK-NOT: %2 = trunc i64 %indvars.iv.next to i32
; CHECK-NOT: %cmp2.us = icmp slt i32
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index b566c147e9b88..1eab707540300 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -170,4 +170,16 @@ define { i32 addrspace(4)*, i1 } @cmpxchg_group_to_flat_wrong_operand(i32 addrsp
ret { i32 addrspace(4)*, i1 } %ret
}
+; Null pointer in local addr space
+; CHECK-LABEL: @local_nullptr
+; CHECK: icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*)
+; CHECK-NOT: i8 addrspace(3)* null
+define void @local_nullptr(i32 addrspace(1)* nocapture %results, i8 addrspace(3)* %a) {
+entry:
+ %tobool = icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*)
+ %conv = zext i1 %tobool to i32
+ store i32 %conv, i32 addrspace(1)* %results, align 4
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/Transforms/Inline/ARM/inline-target-attr.ll b/test/Transforms/Inline/ARM/inline-target-attr.ll
new file mode 100644
index 0000000000000..5bbecd2035288
--- /dev/null
+++ b/test/Transforms/Inline/ARM/inline-target-attr.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -inline | FileCheck %s
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s
+; Check that we only inline when we have compatible target attributes.
+; ARM implements a target-attribute compatibility check that verifies the
+; caller's and callee's attribute sets are compatible before inlining.
+
+define i32 @foo() #0 {
+entry:
+ %call = call i32 (...) @baz()
+ ret i32 %call
+; CHECK-LABEL: foo
+; CHECK: call i32 (...) @baz()
+}
+declare i32 @baz(...) #0
+
+define i32 @bar() #1 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+; CHECK-LABEL: bar
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @qux() #0 {
+entry:
+ %call = call i32 @bar()
+ ret i32 %call
+; CHECK-LABEL: qux
+; CHECK: call i32 @bar()
+}
+
+define i32 @thumb_fn() #2 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+; CHECK-LABEL: thumb_fn
+; CHECK: call i32 @foo
+}
+
+define i32 @strict_align() #3 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+; CHECK-LABEL: strict_align
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @soft_float_fn() #4 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+; CHECK-LABEL: soft_float_fn
+; CHECK: call i32 @foo
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+dsp,+neon" }
+attributes #1 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16" }
+attributes #2 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16,+thumb-mode" }
+attributes #3 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+strict-align" }
+attributes #4 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16,+soft-float" }
diff --git a/test/Transforms/Inline/ARM/lit.local.cfg b/test/Transforms/Inline/ARM/lit.local.cfg
new file mode 100644
index 0000000000000..236e1d3441665
--- /dev/null
+++ b/test/Transforms/Inline/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
index 82d321ccf225c..164f7a66a6f3c 100644
--- a/test/Transforms/Inline/cgscc-incremental-invalidate.ll
+++ b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
@@ -11,17 +11,35 @@
; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h)
; CHECK: Running analysis: DominatorTreeAnalysis on test1_f
; CHECK: Running analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h)
+; CHECK: Invalidating all non-preserved analyses for: (test1_f)
; CHECK: Invalidating all non-preserved analyses for: test1_f
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Invalidating analysis: LoopAnalysis on test1_f
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_f
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_f
+; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
; CHECK: Invalidating all non-preserved analyses for: test1_g
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: test1_h
-; CHECK-NOT: Invalidating anaylsis:
-; CHECK: Running analysis: DominatorTreeAnalysis on test1_h
-; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
+; CHECK: Invalidating analysis: LoopAnalysis on test1_g
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_g
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_g
; CHECK: Invalidating all non-preserved analyses for: test1_h
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h
+; CHECK: Invalidating analysis: LoopAnalysis on test1_h
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_h
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_h
+; CHECK-NOT: Invalidating analysis:
+; CHECK: Starting llvm::Function pass manager run.
+; CHECK-NEXT: Running pass: DominatorTreeVerifierPass on test1_g
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_g
+; CHECK-NEXT: Finished llvm::Function pass manager run.
+; CHECK-NEXT: Starting llvm::Function pass manager run.
+; CHECK-NEXT: Running pass: DominatorTreeVerifierPass on test1_h
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_h
+; CHECK-NEXT: Finished llvm::Function pass manager run.
+; CHECK-NOT: Invalidating analysis:
+; CHECK: Running pass: DominatorTreeVerifierPass on test1_f
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_f
; An external function used to control branches.
declare i1 @flag()
@@ -109,3 +127,80 @@ entry:
ret void
; CHECK: ret void
}
+
+; The 'test2_' prefixed code is carefully arranged to trigger forming an SCC
+; where a dominator tree has been computed for one of the functions but not
+; the other, and where the SCC that things get merged into does not yet have
+; a function analysis manager proxy. Without proper handling when updating
+; the call graph, this will find a stale dominator tree.
+
+@test2_global = external global i32, align 4
+
+define void @test2_hoge(i1 (i32*)* %arg) {
+; CHECK-LABEL: define void @test2_hoge(
+bb:
+ %tmp2 = call zeroext i1 %arg(i32* @test2_global)
+; CHECK: call zeroext i1 %arg(
+ br label %bb3
+
+bb3:
+ %tmp5 = call zeroext i1 %arg(i32* @test2_global)
+; CHECK: call zeroext i1 %arg(
+ br i1 %tmp5, label %bb3, label %bb6
+
+bb6:
+ ret void
+}
+
+define zeroext i1 @test2_widget(i32* %arg) {
+; CHECK-LABEL: define zeroext i1 @test2_widget(
+bb:
+ %tmp1 = alloca i8, align 1
+ %tmp2 = alloca i32, align 4
+ call void @test2_quux()
+; CHECK-NOT: call
+;
+; CHECK: call zeroext i1 @test2_widget(i32* @test2_global)
+; CHECK-NEXT: br label %[[NEW_BB:.*]]
+;
+; CHECK: [[NEW_BB]]:
+; CHECK-NEXT: call zeroext i1 @test2_widget(i32* @test2_global)
+;
+; CHECK: {{.*}}:
+
+ call void @test2_hoge.1(i32* %arg)
+; CHECK-NEXT: call void @test2_hoge.1(
+
+ %tmp4 = call zeroext i1 @test2_barney(i32* %tmp2)
+ %tmp5 = zext i1 %tmp4 to i32
+ store i32 %tmp5, i32* %tmp2, align 4
+ %tmp6 = call zeroext i1 @test2_barney(i32* null)
+ call void @test2_ham(i8* %tmp1)
+; CHECK: call void @test2_ham(
+
+ call void @test2_quux()
+; CHECK-NOT: call
+;
+; CHECK: call zeroext i1 @test2_widget(i32* @test2_global)
+; CHECK-NEXT: br label %[[NEW_BB:.*]]
+;
+; CHECK: [[NEW_BB]]:
+; CHECK-NEXT: call zeroext i1 @test2_widget(i32* @test2_global)
+;
+; CHECK: {{.*}}:
+ ret i1 true
+; CHECK-NEXT: ret i1 true
+}
+
+define internal void @test2_quux() {
+; CHECK-NOT: @test2_quux
+bb:
+ call void @test2_hoge(i1 (i32*)* @test2_widget)
+ ret void
+}
+
+declare void @test2_hoge.1(i32*)
+
+declare zeroext i1 @test2_barney(i32*)
+
+declare void @test2_ham(i8*)
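; The call-graph shape behind test2, as the comment above describes it:
; @test2_quux calls @test2_hoge, which calls back into @test2_widget through
; the function pointer, and @test2_widget calls @test2_quux. Inlining
; @test2_quux therefore merges SCCs in which only some member functions have
; a cached dominator tree, which is the situation that used to expose a stale
; tree during the call graph update.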
diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll
new file mode 100644
index 0000000000000..3c4e08b5b515c
--- /dev/null
+++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK: llvm.umul.with.overflow
+define i32 @sterix(i32, i8, i64) {
+entry:
+ %conv = zext i32 %0 to i64
+ %conv1 = sext i8 %1 to i32
+ %mul = mul i32 %conv1, 1945964878
+ %sh_prom = trunc i64 %2 to i32
+ %shr = lshr i32 %mul, %sh_prom
+ %conv2 = zext i32 %shr to i64
+ %mul3 = mul nuw nsw i64 %conv, %conv2
+ %conv6 = and i64 %mul3, 4294967295
+ %tobool = icmp ne i64 %conv6, %mul3
+ br i1 %tobool, label %lor.end, label %lor.rhs
+
+lor.rhs:
+ %and = and i64 %2, %mul3
+ %conv4 = trunc i64 %and to i32
+ %tobool7 = icmp ne i32 %conv4, 0
+ %lnot = xor i1 %tobool7, true
+ br label %lor.end
+
+lor.end:
+ %3 = phi i1 [ true, %entry ], [ %lnot, %lor.rhs ]
+ %conv8 = zext i1 %3 to i32
+ ret i32 %conv8
+}
+
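; Why this folds to llvm.umul.with.overflow, in outline: %mul3 is a 64-bit
; product of two operands that are provably below 2^32 (%conv and %conv2 are
; both zexts from i32). Masking the product with 4294967295 (= 2^32 - 1) and
; comparing the result against the full product is precisely a "did the
; multiply overflow 32 bits" test; instcombine recognizes that idiom and
; rewrites it to the intrinsic, as the CHECK line at the top asserts.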
diff --git a/test/Transforms/InstCombine/and-or-not.ll b/test/Transforms/InstCombine/and-or-not.ll
index 1baecb4a13a3b..04f7be01eaf5c 100644
--- a/test/Transforms/InstCombine/and-or-not.ll
+++ b/test/Transforms/InstCombine/and-or-not.ll
@@ -570,10 +570,8 @@ define i32 @xor_to_xnor1(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor1(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%a = fptosi float %fa to i32
@@ -591,10 +589,8 @@ define i32 @xor_to_xnor2(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor2(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[B]], [[A]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%a = fptosi float %fa to i32
@@ -612,10 +608,8 @@ define i32 @xor_to_xnor3(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor3(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%a = fptosi float %fa to i32
@@ -633,10 +627,8 @@ define i32 @xor_to_xnor4(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor4(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[B]], [[A]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%a = fptosi float %fa to i32
diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll
index 91678a91962a8..260e2330996ed 100644
--- a/test/Transforms/InstCombine/bswap-fold.ll
+++ b/test/Transforms/InstCombine/bswap-fold.ll
@@ -1,35 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
-define i1 @test1(i16 %t) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 %t, 256
-; CHECK-NEXT: ret i1 [[TMP2]]
-;
- %tmp1 = call i16 @llvm.bswap.i16( i16 %t )
- %tmp2 = icmp eq i16 %tmp1, 1
- ret i1 %tmp2
-}
-
-define i1 @test2(i32 %tmp) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = icmp eq i32 %tmp, 16777216
-; CHECK-NEXT: ret i1 [[TMP_UPGRD_1]]
-;
- %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp )
- %tmp.upgrd.1 = icmp eq i32 %tmp34, 1
- ret i1 %tmp.upgrd.1
-}
-
-define i1 @test3(i64 %tmp) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: [[TMP_UPGRD_2:%.*]] = icmp eq i64 %tmp, 72057594037927936
-; CHECK-NEXT: ret i1 [[TMP_UPGRD_2]]
-;
- %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp )
- %tmp.upgrd.2 = icmp eq i64 %tmp34, 1
- ret i1 %tmp.upgrd.2
-}
-
; rdar://5992453
; A & 255
define i32 @test4(i32 %a) nounwind {
@@ -241,6 +212,136 @@ define i64 @bs_xor64(i64 %a, i64 %b) #0 {
ret i64 %tmp3
}
+define <2 x i32> @bs_and32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_and32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_or32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_or32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_xor32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_xor32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+ %tmp3 = xor <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_and32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_and32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = and <2 x i32> %tmp1, <i32 100001, i32 100001>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @bs_or32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_or32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = or <2 x i32> %tmp1, <i32 100001, i32 100001>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @bs_xor32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_xor32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+ %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ %tmp2 = xor <2 x i32> %tmp1, <i32 100001, i32 100001>
+ ret <2 x i32> %tmp2
+}
+
+define i64 @bs_and64_multiuse1(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse1(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+ %tmp3 = and i64 %tmp1, %tmp2
+ %tmp4 = mul i64 %tmp3, %tmp1 ; to increase use count of the bswaps
+ %tmp5 = mul i64 %tmp4, %tmp2 ; to increase use count of the bswaps
+ ret i64 %tmp5
+}
+
+define i64 @bs_and64_multiuse2(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse2(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[A]], [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+ %tmp3 = and i64 %tmp1, %tmp2
+ %tmp4 = mul i64 %tmp3, %tmp1 ; to increase use count of the bswaps
+ ret i64 %tmp4
+}
+
+define i64 @bs_and64_multiuse3(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse3(
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+ %tmp3 = and i64 %tmp1, %tmp2
+ %tmp4 = mul i64 %tmp3, %tmp2 ; to increase use count of the bswaps
+ ret i64 %tmp4
+}
+
+define i64 @bs_and64i_multiuse(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64i_multiuse(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1000000001
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP3]]
+;
+ %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %tmp2 = and i64 %tmp1, 1000000001
+ %tmp3 = mul i64 %tmp2, %tmp1 ; to increase use count of the bswap
+ ret i64 %tmp3
+}
+
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
declare i64 @llvm.bswap.i64(i64)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
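; The identity behind the new vector and multi-use cases: bswap only permutes
; bytes, while and/or/xor act on each byte independently, so
; bswap(a) op bswap(b) == bswap(a op b). The *_multiuse tests pin down the
; profitability side: the fold is only worthwhile when it lets at least one
; of the original bswaps die, so with both bswaps reused (as in
; @bs_and64_multiuse1) the rewrite is skipped.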
diff --git a/test/Transforms/InstCombine/cmp-intrinsic.ll b/test/Transforms/InstCombine/cmp-intrinsic.ll
new file mode 100644
index 0000000000000..7fc1d12916bf8
--- /dev/null
+++ b/test/Transforms/InstCombine/cmp-intrinsic.ll
@@ -0,0 +1,123 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+declare i33 @llvm.cttz.i33(i33, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i8 @llvm.ctpop.i8(i8)
+declare i11 @llvm.ctpop.i11(i11)
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
+
+define i1 @bswap_eq_i16(i16 %x) {
+; CHECK-LABEL: @bswap_eq_i16(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 %x, 256
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %bs = call i16 @llvm.bswap.i16(i16 %x)
+ %cmp = icmp eq i16 %bs, 1
+ ret i1 %cmp
+}
+
+define i1 @bswap_ne_i32(i32 %x) {
+; CHECK-LABEL: @bswap_ne_i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 %x, 33554432
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %bs = tail call i32 @llvm.bswap.i32(i32 %x)
+ %cmp = icmp ne i32 %bs, 2
+ ret i1 %cmp
+}
+
+define <2 x i1> @bswap_eq_v2i64(<2 x i64> %x) {
+; CHECK-LABEL: @bswap_eq_v2i64(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i64> %x, <i64 216172782113783808, i64 216172782113783808>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %bs = tail call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %x)
+ %cmp = icmp eq <2 x i64> %bs, <i64 3, i64 3>
+ ret <2 x i1> %cmp
+}
+
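The large magic numbers in the bswap CHECK lines above are just the byte-swapped right-hand sides: comparing a bswap result against a constant folds to comparing the source against the swapped constant, so the call disappears. Worked out for the three tests:

;   i16: bswap(0x0100) = 1,             so bs == 1 iff x == 256
;   i32: bswap(0x02000000) = 2,         so bs == 2 iff x == 33554432
;   i64: bswap(0x0300000000000000) = 3, so bs == 3 iff x == 216172782113783808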
+define i1 @ctlz_eq_bitwidth_i32(i32 %x) {
+; CHECK-LABEL: @ctlz_eq_bitwidth_i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+ %cmp = icmp eq i32 %lz, 32
+ ret i1 %cmp
+}
+
+define <2 x i1> @ctlz_ne_bitwidth_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_ne_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+ %cmp = icmp ne <2 x i32> %x, <i32 32, i32 32>
+ ret <2 x i1> %cmp
+}
+
+define i1 @cttz_ne_bitwidth_i33(i33 %x) {
+; CHECK-LABEL: @cttz_ne_bitwidth_i33(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i33 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+ %cmp = icmp ne i33 %tz, 33
+ ret i1 %cmp
+}
+
+define <2 x i1> @cttz_eq_bitwidth_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_eq_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+ %cmp = icmp eq <2 x i32> %x, <i32 32, i32 32>
+ ret <2 x i1> %cmp
+}
+
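ctlz and cttz can return the full bit width only for a zero input, and the i1 false operand makes that zero case well defined, so the compares above collapse to zero tests:

;   icmp eq (ctlz(x, false)), 32  ->  icmp eq x, 0
;   icmp ne (cttz(x, false)), 33  ->  icmp ne x, 0   (i33 flavor)

With an i1 true operand the zero input would be undefined, so the same reasoning would not carry over directly.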
+define i1 @ctpop_eq_zero_i11(i11 %x) {
+; CHECK-LABEL: @ctpop_eq_zero_i11(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i11 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %pop = tail call i11 @llvm.ctpop.i11(i11 %x)
+ %cmp = icmp eq i11 %pop, 0
+ ret i1 %cmp
+}
+
+define <2 x i1> @ctpop_ne_zero_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: @ctpop_ne_zero_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %x, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+ %cmp = icmp ne <2 x i32> %pop, zeroinitializer
+ ret <2 x i1> %cmp
+}
+
+define i1 @ctpop_eq_bitwidth_i8(i8 %x) {
+; CHECK-LABEL: @ctpop_eq_bitwidth_i8(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %pop = tail call i8 @llvm.ctpop.i8(i8 %x)
+ %cmp = icmp eq i8 %pop, 8
+ ret i1 %cmp
+}
+
+define <2 x i1> @ctpop_ne_bitwidth_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: @ctpop_ne_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %x, <i32 -1, i32 -1>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+ %cmp = icmp ne <2 x i32> %pop, <i32 32, i32 32>
+ ret <2 x i1> %cmp
+}
+
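The ctpop compares reduce the same way: a population count of zero forces the input to zero, and a count equal to the bit width forces every bit set. For i8, ctpop(0xFF) = 8 and no other i8 value reaches 8, which is why icmp eq (ctpop x), 8 becomes icmp eq x, -1 above.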
diff --git a/test/Transforms/InstCombine/consecutive-fences.ll b/test/Transforms/InstCombine/consecutive-fences.ll
index 6f1c412773861..8ecb399f39cb8 100644
--- a/test/Transforms/InstCombine/consecutive-fences.ll
+++ b/test/Transforms/InstCombine/consecutive-fences.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: define void @tinkywinky
; CHECK-NEXT: fence seq_cst
-; CHECK-NEXT: fence singlethread acquire
+; CHECK-NEXT: fence syncscope("singlethread") acquire
; CHECK-NEXT: ret void
; CHECK-NEXT: }
@@ -12,21 +12,21 @@ define void @tinkywinky() {
fence seq_cst
fence seq_cst
fence seq_cst
- fence singlethread acquire
- fence singlethread acquire
- fence singlethread acquire
+ fence syncscope("singlethread") acquire
+ fence syncscope("singlethread") acquire
+ fence syncscope("singlethread") acquire
ret void
}
; CHECK-LABEL: define void @dipsy
; CHECK-NEXT: fence seq_cst
-; CHECK-NEXT: fence singlethread seq_cst
+; CHECK-NEXT: fence syncscope("singlethread") seq_cst
; CHECK-NEXT: ret void
; CHECK-NEXT: }
define void @dipsy() {
fence seq_cst
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
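These updates track the textual IR change from the bare 'singlethread' keyword to the general syncscope("singlethread") spelling; the underlying fence-merging fold is unchanged. A minimal sketch in the new syntax (function name illustrative):

define void @redundant_fences() {
  fence seq_cst
  fence seq_cst                             ; expected to merge with the fence above
  fence syncscope("singlethread") acquire   ; new spelling of the old 'singlethread'
  ret void
}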
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 127fde10e9f7b..a12f4206b1c6d 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2979,9 +2979,7 @@ declare i32 @llvm.bswap.i32(i32)
define i1 @bswap_ne(i32 %x, i32 %y) {
; CHECK-LABEL: @bswap_ne(
-; CHECK-NEXT: [[SWAPX:%.*]] = call i32 @llvm.bswap.i32(i32 %x)
-; CHECK-NEXT: [[SWAPY:%.*]] = call i32 @llvm.bswap.i32(i32 %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[SWAPX]], [[SWAPY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 %x, %y
; CHECK-NEXT: ret i1 [[CMP]]
;
%swapx = call i32 @llvm.bswap.i32(i32 %x)
@@ -2994,9 +2992,7 @@ declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
define <8 x i1> @bswap_vec_eq(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: @bswap_vec_eq(
-; CHECK-NEXT: [[SWAPX:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %x)
-; CHECK-NEXT: [[SWAPY:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x i16> [[SWAPX]], [[SWAPY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x i16> %x, %y
; CHECK-NEXT: ret <8 x i1> [[CMP]]
;
%swapx = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %x)
@@ -3009,9 +3005,7 @@ declare i64 @llvm.bitreverse.i64(i64)
define i1 @bitreverse_eq(i64 %x, i64 %y) {
; CHECK-LABEL: @bitreverse_eq(
-; CHECK-NEXT: [[REVX:%.*]] = call i64 @llvm.bitreverse.i64(i64 %x)
-; CHECK-NEXT: [[REVY:%.*]] = call i64 @llvm.bitreverse.i64(i64 %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[REVX]], [[REVY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 %x, %y
; CHECK-NEXT: ret i1 [[CMP]]
;
%revx = call i64 @llvm.bitreverse.i64(i64 %x)
@@ -3024,9 +3018,7 @@ declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
define <8 x i1> @bitreverse_vec_ne(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: @bitreverse_vec_ne(
-; CHECK-NEXT: [[REVX:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %x)
-; CHECK-NEXT: [[REVY:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[REVX]], [[REVY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> %x, %y
; CHECK-NEXT: ret <8 x i1> [[CMP]]
;
%revx = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %x)
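bswap and bitreverse are involutions (applying either twice is the identity), so they are bijections and equality is preserved through them; both calls can simply be dropped:

;   icmp eq (bswap x), (bswap y)            ->  icmp eq x, y
;   icmp ne (bitreverse x), (bitreverse y)  ->  icmp ne x, y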
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index c294d79f15efe..8d2f06edcaf3d 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -475,66 +475,6 @@ define <2 x i1> @ctlz_knownbits3_vec(<2 x i8> %arg) {
ret <2 x i1> %res
}
-define void @cmp.simplify(i32 %a, i32 %b, i1* %c) {
- %lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone
- %lz.cmp = icmp eq i32 %lz, 32
- store volatile i1 %lz.cmp, i1* %c
- %tz = tail call i32 @llvm.cttz.i32(i32 %a, i1 false) nounwind readnone
- %tz.cmp = icmp ne i32 %tz, 32
- store volatile i1 %tz.cmp, i1* %c
- %pop0 = tail call i32 @llvm.ctpop.i32(i32 %b) nounwind readnone
- %pop0.cmp = icmp eq i32 %pop0, 0
- store volatile i1 %pop0.cmp, i1* %c
- %pop1 = tail call i32 @llvm.ctpop.i32(i32 %b) nounwind readnone
- %pop1.cmp = icmp eq i32 %pop1, 32
- store volatile i1 %pop1.cmp, i1* %c
- ret void
-; CHECK: @cmp.simplify
-; CHECK-NEXT: %lz.cmp = icmp eq i32 %a, 0
-; CHECK-NEXT: store volatile i1 %lz.cmp, i1* %c
-; CHECK-NEXT: %tz.cmp = icmp ne i32 %a, 0
-; CHECK-NEXT: store volatile i1 %tz.cmp, i1* %c
-; CHECK-NEXT: %pop0.cmp = icmp eq i32 %b, 0
-; CHECK-NEXT: store volatile i1 %pop0.cmp, i1* %c
-; CHECK-NEXT: %pop1.cmp = icmp eq i32 %b, -1
-; CHECK-NEXT: store volatile i1 %pop1.cmp, i1* %c
-}
-
-define <2 x i1> @ctlz_cmp_vec(<2 x i32> %a) {
-; CHECK-LABEL: @ctlz_cmp_vec(
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: ret <2 x i1> [[CMP]]
-;
- %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
- %cmp = icmp eq <2 x i32> %x, <i32 32, i32 32>
- ret <2 x i1> %cmp
-}
-
-define <2 x i1> @cttz_cmp_vec(<2 x i32> %a) {
-; CHECK-LABEL: @cttz_cmp_vec(
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: ret <2 x i1> [[CMP]]
-;
- %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
- %cmp = icmp ne <2 x i32> %x, <i32 32, i32 32>
- ret <2 x i1> %cmp
-}
-
-define void @ctpop_cmp_vec(<2 x i32> %a, <2 x i1>* %b) {
- %pop0 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) nounwind readnone
- %pop0.cmp = icmp eq <2 x i32> %pop0, zeroinitializer
- store volatile <2 x i1> %pop0.cmp, <2 x i1>* %b
- %pop1 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) nounwind readnone
- %pop1.cmp = icmp eq <2 x i32> %pop1, < i32 32, i32 32 >
- store volatile <2 x i1> %pop1.cmp, <2 x i1>* %b
- ret void
-; CHECK-LABEL: @ctpop_cmp_vec(
-; CHECK-NEXT: %pop0.cmp = icmp eq <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: store volatile <2 x i1> %pop0.cmp, <2 x i1>* %b
-; CHECK-NEXT: %pop1.cmp = icmp eq <2 x i32> %a, <i32 -1, i32 -1>
-; CHECK-NEXT: store volatile <2 x i1> %pop1.cmp, <2 x i1>* %b
-}
-
define i32 @ctlz_undef(i32 %Value) {
; CHECK-LABEL: @ctlz_undef(
; CHECK-NEXT: ret i32 undef
diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll
index 2164f0df8d279..947971c6c83b0 100644
--- a/test/Transforms/InstCombine/or-xor.ll
+++ b/test/Transforms/InstCombine/or-xor.ll
@@ -348,10 +348,8 @@ define i8 @test18(i8 %A, i8 %B) {
; ((x | y) ^ (~x | ~y)) -> ~(x ^ y)
define i32 @test19(i32 %x, i32 %y) {
; CHECK-LABEL: @test19(
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%noty = xor i32 %y, -1
@@ -365,10 +363,8 @@ define i32 @test19(i32 %x, i32 %y) {
; ((x | y) ^ (~y | ~x)) -> ~(x ^ y)
define i32 @test20(i32 %x, i32 %y) {
; CHECK-LABEL: @test20(
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%noty = xor i32 %y, -1
@@ -382,10 +378,8 @@ define i32 @test20(i32 %x, i32 %y) {
; ((~x | ~y) ^ (x | y)) -> ~(x ^ y)
define i32 @test21(i32 %x, i32 %y) {
; CHECK-LABEL: @test21(
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%noty = xor i32 %y, -1
@@ -399,10 +393,8 @@ define i32 @test21(i32 %x, i32 %y) {
; ((~x | ~y) ^ (y | x)) -> ~(x ^ y)
define i32 @test22(i32 %x, i32 %y) {
; CHECK-LABEL: @test22(
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[Y]], [[X]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
%noty = xor i32 %y, -1
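The new two-instruction form follows from De Morgan plus one xor identity:

;   ~x | ~y = ~(x & y)
;   (x | y) ^ (x & y) = x ^ y
;   => (x | y) ^ (~x | ~y) = (x | y) ^ ~(x & y) = ~((x | y) ^ (x & y)) = ~(x ^ y)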
diff --git a/test/Transforms/InstCombine/pr33689_same_bitwidth.ll b/test/Transforms/InstCombine/pr33689_same_bitwidth.ll
new file mode 100644
index 0000000000000..e5dd019b9b519
--- /dev/null
+++ b/test/Transforms/InstCombine/pr33689_same_bitwidth.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine %s -o - | FileCheck %s
+
+; All the "useless" instructions should be removed and we shouldn't crash.
+
+target datalayout = "p:16:16"
+
+%i64_t = type i64
+
+@a = external global i16
+@b = external global i16*
+
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: bb0:
+; CHECK-NEXT: [[TMP12:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT: [[TMP12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP12]], i16 0, i16 0
+; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint [2 x i32]* [[TMP12]] to i16
+; CHECK-NEXT: store i16 [[TMP8]], i16* @a, align 2
+; CHECK-NEXT: unreachable
+; CHECK: bb2:
+; CHECK-NEXT: [[TMP9:%.*]] = load i16*, i16** @b, align 2
+; CHECK-NEXT: store i16 0, i16* [[TMP9]], align 2
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP12_SUB]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -1
+; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP12_SUB]], align 8
+; CHECK-NEXT: ret void
+;
+bb0:
+ %tmp1 = alloca %i64_t
+ %tmp2 = bitcast %i64_t* %tmp1 to i32*
+ %useless3 = bitcast %i64_t* %tmp1 to i16*
+ %useless4 = getelementptr inbounds i16, i16* %useless3, i16 undef
+ %useless5 = bitcast i16* %useless4 to i32*
+ br i1 undef, label %bb1, label %bb2
+
+bb1: ; preds = %bb0
+ %useless6 = insertvalue [1 x i32*] undef, i32* %tmp2, 0
+ %useless7 = insertvalue [1 x i32*] %useless6, i32* null, 0
+ %tmp8 = ptrtoint i32* %tmp2 to i16
+ store i16 %tmp8, i16* @a
+ unreachable
+
+bb2: ; preds = %bb0
+ %tmp9 = load i16*, i16** @b
+ store i16 0, i16* %tmp9
+ %tmp10 = load i32, i32* %tmp2
+ %tmp11 = sub i32 %tmp10, 1
+ store i32 %tmp11, i32* %tmp2
+ ret void
+}
diff --git a/test/Transforms/InstCombine/select-implied.ll b/test/Transforms/InstCombine/select-implied.ll
index 2100e3eae0089..2558745c18f3c 100644
--- a/test/Transforms/InstCombine/select-implied.ll
+++ b/test/Transforms/InstCombine/select-implied.ll
@@ -121,3 +121,80 @@ end:
declare void @foo(i32)
declare i32 @bar(i32)
+
+; CHECK-LABEL: @test_and
+; CHECK: tpath:
+; CHECK-NOT: select
+; CHECK: ret i32 313
+define i32 @test_and(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp ne i32 %a, 0
+ %cmp2 = icmp ne i32 %b, 0
+ %and = and i1 %cmp1, %cmp2
+ br i1 %and, label %tpath, label %end
+
+tpath:
+ %cmp3 = icmp eq i32 %a, 0 ;; <-- implied false
+ %c = select i1 %cmp3, i32 0, i32 313
+ ret i32 %c
+
+end:
+ ret i32 0
+}
+
+; cmp1 and cmp2 are false on the 'fpath' path and thus cmp3 is true.
+; CHECK-LABEL: @test_or1
+; CHECK: fpath:
+; CHECK-NOT: select
+; CHECK: ret i32 37
+define i32 @test_or1(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp2 = icmp eq i32 %b, 0
+ %or = or i1 %cmp1, %cmp2
+ br i1 %or, label %end, label %fpath
+
+fpath:
+ %cmp3 = icmp ne i32 %a, 0 ;; <-- implied true
+ %c = select i1 %cmp3, i32 37, i32 0
+ ret i32 %c
+
+end:
+ ret i32 0
+}
+
+; LHS ==> RHS by definition (true -> true)
+; CHECK-LABEL: @test6
+; CHECK: taken:
+; CHECK-NOT: select
+; CHECK: call void @foo(i32 10)
+define void @test6(i32 %a, i32 %b) {
+ %cmp1 = icmp eq i32 %a, %b
+ br i1 %cmp1, label %taken, label %end
+
+taken:
+ %c = select i1 %cmp1, i32 10, i32 0
+ call void @foo(i32 %c)
+ br label %end
+
+end:
+ ret void
+}
+
+; LHS ==> RHS by definition (false -> false)
+; CHECK-LABEL: @test7
+; CHECK: taken:
+; CHECK-NOT: select
+; CHECK: call void @foo(i32 11)
+define void @test7(i32 %a, i32 %b) {
+ %cmp1 = icmp eq i32 %a, %b
+ br i1 %cmp1, label %end, label %taken
+
+taken:
+ %c = select i1 %cmp1, i32 0, i32 11
+ call void @foo(i32 %c)
+ br label %end
+
+end:
+ ret void
+}
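The simplest instance of the implied-condition rewrite is a select on the exact branch condition, which is what test6/test7 exercise; dominating-condition reasoning folds the select to a constant. A minimal sketch (function name illustrative):

define i32 @select_on_branch_cond(i1 %c) {
entry:
  br i1 %c, label %taken, label %end

taken:                            ; %c is known true on this path
  %r = select i1 %c, i32 1, i32 0 ; expected to fold to 1
  ret i32 %r

end:
  ret i32 0
}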
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index c8f2a50b72eda..acfa053daaf8d 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -1370,3 +1370,10 @@ define i8 @assume_cond_false(i1 %cond, i8 %x, i8 %y) {
ret i8 %sel
}
+; Test case to make sure we don't treat an all-ones float value as eligible for converting the select into a sext.
+define <4 x float> @PR33721(<4 x float> %w) {
+entry:
+ %0 = fcmp ole <4 x float> %w, zeroinitializer
+ %1 = select <4 x i1> %0, <4 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <4 x float> zeroinitializer
+ ret <4 x float> %1
+}
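The constant 0xFFFFFFFFE0000000 here appears to be the IR double spelling of the float whose bit pattern is all ones, i.e. a NaN. For integers, select c, -1, 0 is equivalent to sext(c), but that rewrite has no meaning for a float operand that merely happens to have an all-ones bit pattern, which is what PR33721 guards against.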
diff --git a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
index 5938f9d7321d6..715c9413a8196 100644
--- a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -854,3 +854,32 @@ define void @load_factor2_fp128(<4 x fp128>* %ptr) {
%v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
ret void
}
+
+define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
+; NEON-LABEL: @load_factor2_wide_pointer(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT: [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT: [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor2_wide_pointer(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+ %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
+ %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret void
+}
diff --git a/test/Transforms/LoopRotate/pr33701.ll b/test/Transforms/LoopRotate/pr33701.ll
new file mode 100644
index 0000000000000..ed162b1209828
--- /dev/null
+++ b/test/Transforms/LoopRotate/pr33701.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+
+define void @func() {
+bb0:
+ br label %bb1
+
+bb1: ; preds = %bb4, %bb0
+ %0 = phi i16 [ %2, %bb4 ], [ 0, %bb0 ]
+ %1 = icmp sle i16 %0, 2
+ br i1 %1, label %bb2, label %bb5
+
+bb2: ; preds = %bb1
+ br i1 undef, label %bb6, label %bb4
+
+bb3: ; No predecessors!
+ br label %bb6
+
+bb4: ; preds = %bb2
+ %2 = add i16 undef, 1
+ br label %bb1
+
+bb5: ; preds = %bb1
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb3, %bb2
+ unreachable
+}
diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
index dcd068191e105..ea3f607723197 100644
--- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -14,8 +14,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; current LSR cost model.
; CHECK-NOT: = ptrtoint i8* undef to i64
; CHECK: .lr.ph
-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
-; CHECK: sub i64 [[TMP]], %tmp6
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1
+; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}}
; CHECK: ret void
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
bb:
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
new file mode 100644
index 0000000000000..4ce6f1a79fbfb
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ham = type { i8, i8, [5 x i32], i64, i64, i64 }
+
+@global = external local_unnamed_addr global %struct.ham, align 8
+
+define void @foo() local_unnamed_addr {
+bb:
+ %tmp = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 3), align 8
+ %tmp1 = and i64 %tmp, 1792
+ %tmp2 = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 4), align 8
+ %tmp3 = add i64 %tmp1, %tmp2
+ %tmp4 = load i8*, i8** null, align 8
+ %tmp5 = getelementptr inbounds i8, i8* %tmp4, i64 0
+ %tmp6 = sub i64 0, %tmp3
+ %tmp7 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp6
+ %tmp8 = inttoptr i64 0 to i8*
+ br label %bb9
+
+; Without filtering out non-optimal formulae with the same ScaledReg and Scale, the
+; strategy of narrowing the LSR search space by picking a winner reg would generate
+; only one lsr.iv and a suboptimal result.
+; CHECK-LABEL: @foo(
+; CHECK: bb9:
+; CHECK-NEXT: = phi i8*
+; CHECK-NEXT: = phi i8*
+
+bb9: ; preds = %bb12, %bb
+ %tmp10 = phi i8* [ %tmp7, %bb ], [ %tmp16, %bb12 ]
+ %tmp11 = phi i8* [ %tmp8, %bb ], [ %tmp17, %bb12 ]
+ br i1 false, label %bb18, label %bb12
+
+bb12: ; preds = %bb9
+ %tmp13 = getelementptr inbounds i8, i8* %tmp10, i64 8
+ %tmp14 = bitcast i8* %tmp13 to i64*
+ %tmp15 = load i64, i64* %tmp14, align 1
+ %tmp16 = getelementptr inbounds i8, i8* %tmp10, i64 16
+ %tmp17 = getelementptr inbounds i8, i8* %tmp11, i64 16
+ br label %bb9
+
+bb18: ; preds = %bb9
+ %tmp19 = icmp ugt i8* %tmp11, null
+ %tmp20 = getelementptr inbounds i8, i8* %tmp10, i64 8
+ %tmp21 = getelementptr inbounds i8, i8* %tmp11, i64 8
+ %tmp22 = select i1 %tmp19, i8* %tmp10, i8* %tmp20
+ %tmp23 = select i1 %tmp19, i8* %tmp11, i8* %tmp21
+ br label %bb24
+
+bb24: ; preds = %bb24, %bb18
+ %tmp25 = phi i8* [ %tmp27, %bb24 ], [ %tmp22, %bb18 ]
+ %tmp26 = phi i8* [ %tmp29, %bb24 ], [ %tmp23, %bb18 ]
+ %tmp27 = getelementptr inbounds i8, i8* %tmp25, i64 1
+ %tmp28 = load i8, i8* %tmp25, align 1
+ %tmp29 = getelementptr inbounds i8, i8* %tmp26, i64 1
+ store i8 %tmp28, i8* %tmp26, align 1
+ %tmp30 = icmp eq i8* %tmp29, %tmp5
+ br label %bb24
+}
diff --git a/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll b/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
index 1f31a133e34d9..73672e14f78ac 100644
--- a/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
@@ -1,29 +1,52 @@
-; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S| FileCheck %s
+; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -S | FileCheck %s -check-prefix=EPILOG-NO-IC
+; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine
+; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s -check-prefix=PROLOG
+; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-runtime-epilog=false -unroll-count=2 -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine
-; the second RUN generates an epilog remainder block for all the test
+; the third and fifth RUNs generate an epilog/prolog remainder block for all the test
; cases below (it does not generate a loop).
; test with three exiting and three exit blocks.
; none of the exit blocks have successors
define void @test1(i64 %trip, i1 %cond) {
-; CHECK-LABEL: test1
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
-; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
-; CHECK-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]]
-; CHECK: entry.new:
-; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TRIP]], [[XTRAITER]]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK-LABEL: loop_latch.epil:
-; CHECK-NEXT: %epil.iter.sub = add i64 %epil.iter, -1
-; CHECK-NEXT: %epil.iter.cmp = icmp eq i64 %epil.iter.sub, 0
-; CHECK-NEXT: br i1 %epil.iter.cmp, label %exit2.loopexit.epilog-lcssa, label %loop_header.epil
-; CHECK-LABEL: loop_latch.7:
-; CHECK-NEXT: %niter.nsub.7 = add i64 %niter, -8
-; CHECK-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0
-; CHECK-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header
+; EPILOG: test1(
+; EPILOG-NEXT: entry:
+; EPILOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
+; EPILOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
+; EPILOG-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; EPILOG-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]]
+; EPILOG: entry.new:
+; EPILOG-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TRIP]], [[XTRAITER]]
+; EPILOG-NEXT: br label [[LOOP_HEADER:%.*]]
+; EPILOG: loop_latch.epil:
+; EPILOG-NEXT: %epil.iter.sub = add i64 %epil.iter, -1
+; EPILOG-NEXT: %epil.iter.cmp = icmp eq i64 %epil.iter.sub, 0
+; EPILOG-NEXT: br i1 %epil.iter.cmp, label %exit2.loopexit.epilog-lcssa, label %loop_header.epil
+; EPILOG: loop_latch.7:
+; EPILOG-NEXT: %niter.nsub.7 = add i64 %niter, -8
+; EPILOG-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0
+; EPILOG-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header
+
+; PROLOG: test1(
+; PROLOG-NEXT: entry:
+; PROLOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
+; PROLOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
+; PROLOG-NEXT: [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; PROLOG-NEXT: br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader
+; PROLOG: loop_header.prol:
+; PROLOG-NEXT: %iv.prol = phi i64 [ 0, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ]
+; PROLOG-NEXT: %prol.iter = phi i64 [ [[XTRAITER]], %loop_header.prol.preheader ], [ %prol.iter.sub, %loop_latch.prol ]
+; PROLOG-NEXT: br i1 %cond, label %loop_latch.prol, label %loop_exiting_bb1.prol
+; PROLOG: loop_latch.prol:
+; PROLOG-NEXT: %iv_next.prol = add i64 %iv.prol, 1
+; PROLOG-NEXT: %prol.iter.sub = add i64 %prol.iter, -1
+; PROLOG-NEXT: %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
+; PROLOG-NEXT: br i1 %prol.iter.cmp, label %loop_header.prol.loopexit.unr-lcssa, label %loop_header.prol
+; PROLOG: loop_latch.7:
+; PROLOG-NEXT: %iv_next.7 = add i64 %iv, 8
+; PROLOG-NEXT: %cmp.7 = icmp eq i64 %iv_next.7, %trip
+; PROLOG-NEXT: br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label %loop_header
entry:
br label %loop_header
@@ -59,17 +82,30 @@ exit2.loopexit:
; %sum.02 and %add. Both of these are incoming values for phi from every exiting
; unrolled block.
define i32 @test2(i32* nocapture %a, i64 %n) {
-; CHECK-LABEL: test2
-; CHECK-LABEL: for.exit2.loopexit:
-; CHECK-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ],
-; CHECK-NEXT: br label %for.exit2
-; CHECK-LABEL: for.exit2.loopexit2:
-; CHECK-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ]
-; CHECK-NEXT: br label %for.exit2
-; CHECK-LABEL: for.exit2:
-; CHECK-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ]
-; CHECK-NEXT: ret i32 %retval
-; CHECK: %niter.nsub.7 = add i64 %niter, -8
+; EPILOG: test2(
+; EPILOG: for.exit2.loopexit:
+; EPILOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ],
+; EPILOG-NEXT: br label %for.exit2
+; EPILOG: for.exit2.loopexit2:
+; EPILOG-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ]
+; EPILOG-NEXT: br label %for.exit2
+; EPILOG: for.exit2:
+; EPILOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ]
+; EPILOG-NEXT: ret i32 %retval
+; EPILOG: %niter.nsub.7 = add i64 %niter, -8
+
+; PROLOG: test2(
+; PROLOG: for.exit2.loopexit:
+; PROLOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ],
+; PROLOG-NEXT: br label %for.exit2
+; PROLOG: for.exit2.loopexit1:
+; PROLOG-NEXT: %retval.ph2 = phi i32 [ 42, %for.exiting_block.prol ], [ %sum.02.prol, %header.prol ]
+; PROLOG-NEXT: br label %for.exit2
+; PROLOG: for.exit2:
+; PROLOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph2, %for.exit2.loopexit1 ]
+; PROLOG-NEXT: ret i32 %retval
+; PROLOG: %indvars.iv.next.7 = add i64 %indvars.iv, 8
+
entry:
br label %header
@@ -102,25 +138,42 @@ for.exit2:
; test with two exiting and three exit blocks.
; the non-latch exiting block has a switch.
define void @test3(i64 %trip, i64 %add) {
-; CHECK-LABEL: test3
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
-; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
-; CHECK-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]]
-; CHECK: entry.new:
-; CHECK-NEXT: %unroll_iter = sub i64 [[TRIP]], [[XTRAITER]]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK-LABEL: loop_header:
-; CHECK-NEXT: %sum = phi i64 [ 0, %entry.new ], [ %sum.next.7, %loop_latch.7 ]
-; CHECK-NEXT: %niter = phi i64 [ %unroll_iter, %entry.new ], [ %niter.nsub.7, %loop_latch.7 ]
-; CHECK-LABEL: loop_exiting_bb1.7:
-; CHECK-NEXT: switch i64 %sum.next.6, label %loop_latch.7
-; CHECK-LABEL: loop_latch.7:
-; CHECK-NEXT: %sum.next.7 = add i64 %sum.next.6, %add
-; CHECK-NEXT: %niter.nsub.7 = add i64 %niter, -8
-; CHECK-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0
-; CHECK-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header
+; EPILOG: test3(
+; EPILOG-NEXT: entry:
+; EPILOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
+; EPILOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
+; EPILOG-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; EPILOG-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]]
+; EPILOG: entry.new:
+; EPILOG-NEXT: %unroll_iter = sub i64 [[TRIP]], [[XTRAITER]]
+; EPILOG-NEXT: br label [[LOOP_HEADER:%.*]]
+; EPILOG: loop_header:
+; EPILOG-NEXT: %sum = phi i64 [ 0, %entry.new ], [ %sum.next.7, %loop_latch.7 ]
+; EPILOG-NEXT: %niter = phi i64 [ %unroll_iter, %entry.new ], [ %niter.nsub.7, %loop_latch.7 ]
+; EPILOG: loop_exiting_bb1.7:
+; EPILOG-NEXT: switch i64 %sum.next.6, label %loop_latch.7
+; EPILOG: loop_latch.7:
+; EPILOG-NEXT: %sum.next.7 = add i64 %sum.next.6, %add
+; EPILOG-NEXT: %niter.nsub.7 = add i64 %niter, -8
+; EPILOG-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0
+; EPILOG-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header
+
+; PROLOG: test3(
+; PROLOG-NEXT: entry:
+; PROLOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1
+; PROLOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7
+; PROLOG-NEXT: [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; PROLOG-NEXT: br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader
+; PROLOG: loop_header:
+; PROLOG-NEXT: %iv = phi i64 [ %iv.unr, %entry.new ], [ %iv_next.7, %loop_latch.7 ]
+; PROLOG-NEXT: %sum = phi i64 [ %sum.unr, %entry.new ], [ %sum.next.7, %loop_latch.7 ]
+; PROLOG: loop_exiting_bb1.7:
+; PROLOG-NEXT: switch i64 %sum.next.6, label %loop_latch.7
+; PROLOG: loop_latch.7:
+; PROLOG-NEXT: %iv_next.7 = add nsw i64 %iv, 8
+; PROLOG-NEXT: %sum.next.7 = add i64 %sum.next.6, %add
+; PROLOG-NEXT: %cmp.7 = icmp eq i64 %iv_next.7, %trip
+; PROLOG-NEXT: br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label %loop_header
entry:
br label %loop_header
@@ -153,9 +206,13 @@ exit2.loopexit:
; FIXME: Support multiple exiting blocks to the same latch exit block.
define i32 @test4(i32* nocapture %a, i64 %n, i1 %cond) {
-; CHECK-LABEL: test4
-; CHECK-NOT: .unr
-; CHECK-NOT: .epil
+; EPILOG: test4(
+; EPILOG-NOT: .unr
+; EPILOG-NOT: .epil
+
+; PROLOG: test4(
+; PROLOG-NOT: .unr
+; PROLOG-NOT: .prol
entry:
br label %header
@@ -184,21 +241,68 @@ for.exit2:
ret i32 42
}
+; FIXME: Support multiple exiting blocks to the unique exit block.
+define void @unique_exit(i32 %arg) {
+; EPILOG: unique_exit(
+; EPILOG-NOT: .unr
+; EPILOG-NOT: .epil
+
+; PROLOG: unique_exit(
+; PROLOG-NOT: .unr
+; PROLOG-NOT: .prol
+entry:
+ %tmp = icmp sgt i32 undef, %arg
+ br i1 %tmp, label %preheader, label %returnblock
+
+preheader: ; preds = %entry
+ br label %header
+
+LoopExit: ; preds = %header, %latch
+ %tmp2.ph = phi i32 [ %tmp4, %header ], [ -1, %latch ]
+ br label %returnblock
+
+returnblock: ; preds = %LoopExit, %entry
+ %tmp2 = phi i32 [ -1, %entry ], [ %tmp2.ph, %LoopExit ]
+ ret void
+
+header: ; preds = %preheader, %latch
+ %tmp4 = phi i32 [ %inc, %latch ], [ %arg, %preheader ]
+ %inc = add nsw i32 %tmp4, 1
+ br i1 true, label %LoopExit, label %latch
+
+latch: ; preds = %header
+ %cmp = icmp slt i32 %inc, undef
+ br i1 %cmp, label %header, label %LoopExit
+}
+
; two exiting and two exit blocks.
; the non-latch exiting block has duplicate edges to the non-latch exit block.
define i64 @test5(i64 %trip, i64 %add, i1 %cond) {
-; CHECK-LABEL: test5
-; CHECK-LABEL: exit1.loopexit:
-; CHECK-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ],
-; CHECK-NEXT: br label %exit1
-; CHECK-LABEL: exit1.loopexit2:
-; CHECK-NEXT: %ivy.epil = add i64 %iv.epil, %add
-; CHECK-NEXT: br label %exit1
-; CHECK-LABEL: exit1:
-; CHECK-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.epil, %exit1.loopexit2 ]
-; CHECK-NEXT: ret i64 %result
-; CHECK-LABEL: loop_latch.7:
-; CHECK: %niter.nsub.7 = add i64 %niter, -8
+; EPILOG: test5(
+; EPILOG: exit1.loopexit:
+; EPILOG-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ],
+; EPILOG-NEXT: br label %exit1
+; EPILOG: exit1.loopexit2:
+; EPILOG-NEXT: %ivy.epil = add i64 %iv.epil, %add
+; EPILOG-NEXT: br label %exit1
+; EPILOG: exit1:
+; EPILOG-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.epil, %exit1.loopexit2 ]
+; EPILOG-NEXT: ret i64 %result
+; EPILOG: loop_latch.7:
+; EPILOG: %niter.nsub.7 = add i64 %niter, -8
+
+; PROLOG: test5(
+; PROLOG: exit1.loopexit:
+; PROLOG-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ],
+; PROLOG-NEXT: br label %exit1
+; PROLOG: exit1.loopexit1:
+; PROLOG-NEXT: %ivy.prol = add i64 %iv.prol, %add
+; PROLOG-NEXT: br label %exit1
+; PROLOG: exit1:
+; PROLOG-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.prol, %exit1.loopexit1 ]
+; PROLOG-NEXT: ret i64 %result
+; PROLOG: loop_latch.7:
+; PROLOG: %iv_next.7 = add nsw i64 %iv, 8
entry:
br label %loop_header
@@ -230,18 +334,31 @@ latchexit:
; test when exit blocks have successors.
define i32 @test6(i32* nocapture %a, i64 %n, i1 %cond, i32 %x) {
-; CHECK-LABEL: test6
-; CHECK-LABEL: for.exit2.loopexit:
-; CHECK-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ],
-; CHECK-NEXT: br label %for.exit2
-; CHECK-LABEL: for.exit2.loopexit2:
-; CHECK-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ]
-; CHECK-NEXT: br label %for.exit2
-; CHECK-LABEL: for.exit2:
-; CHECK-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ]
-; CHECK-NEXT: br i1 %cond, label %exit_true, label %exit_false
-; CHECK-LABEL: latch.7:
-; CHECK: %niter.nsub.7 = add i64 %niter, -8
+; EPILOG: test6(
+; EPILOG: for.exit2.loopexit:
+; EPILOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ],
+; EPILOG-NEXT: br label %for.exit2
+; EPILOG: for.exit2.loopexit2:
+; EPILOG-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ]
+; EPILOG-NEXT: br label %for.exit2
+; EPILOG: for.exit2:
+; EPILOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ]
+; EPILOG-NEXT: br i1 %cond, label %exit_true, label %exit_false
+; EPILOG: latch.7:
+; EPILOG: %niter.nsub.7 = add i64 %niter, -8
+
+; PROLOG: test6(
+; PROLOG: for.exit2.loopexit:
+; PROLOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ],
+; PROLOG-NEXT: br label %for.exit2
+; PROLOG: for.exit2.loopexit1:
+; PROLOG-NEXT: %retval.ph2 = phi i32 [ 42, %for.exiting_block.prol ], [ %sum.02.prol, %header.prol ]
+; PROLOG-NEXT: br label %for.exit2
+; PROLOG: for.exit2:
+; PROLOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph2, %for.exit2.loopexit1 ]
+; PROLOG-NEXT: br i1 %cond, label %exit_true, label %exit_false
+; PROLOG: latch.7:
+; PROLOG: %indvars.iv.next.7 = add i64 %indvars.iv, 8
entry:
br label %header
@@ -277,3 +394,87 @@ exit_true:
exit_false:
ret i32 %addx
}
+
+; test when a value in the exit block does not have a VMap entry.
+define i32 @test7(i32 %arg, i32 %arg1, i32 %arg2) {
+; EPILOG-NO-IC: test7(
+; EPILOG-NO-IC: loopexit1.loopexit:
+; EPILOG-NO-IC-NEXT: %sext3.ph = phi i32 [ %shft, %header ], [ %shft, %latch ], [ %shft, %latch.1 ], [ %shft, %latch.2 ], [ %shft, %latch.3 ], [ %shft, %latch.4 ], [ %shft, %latch.5 ], [ %shft, %latch.6 ]
+; EPILOG-NO-IC-NEXT: br label %loopexit1
+; EPILOG-NO-IC: loopexit1.loopexit1:
+; EPILOG-NO-IC-NEXT: %sext3.ph2 = phi i32 [ %shft, %header.epil ]
+; EPILOG-NO-IC-NEXT: br label %loopexit1
+; EPILOG-NO-IC: loopexit1:
+; EPILOG-NO-IC-NEXT: %sext3 = phi i32 [ %sext3.ph, %loopexit1.loopexit ], [ %sext3.ph2, %loopexit1.loopexit1 ]
+bb:
+ %tmp = icmp slt i32 undef, 2
+ %sext = sext i32 undef to i64
+ %shft = ashr exact i32 %arg, 16
+ br i1 %tmp, label %loopexit2, label %preheader
+
+preheader: ; preds = %bb2
+ br label %header
+
+header: ; preds = %latch, %preheader
+ %tmp6 = phi i64 [ 1, %preheader ], [ %add, %latch ]
+ br i1 false, label %loopexit1, label %latch
+
+latch: ; preds = %header
+ %add = add nuw nsw i64 %tmp6, 1
+ %tmp9 = icmp slt i64 %add, %sext
+ br i1 %tmp9, label %header, label %latchexit
+
+latchexit: ; preds = %latch
+ unreachable
+
+loopexit2: ; preds = %bb2
+ ret i32 %shft
+
+loopexit1: ; preds = %header
+ %sext3 = phi i32 [ %shft, %header ]
+ ret i32 %sext3
+}
+
+; Nested loop where the inner loop is unrolled.
+; FIXME: we cannot unroll with an epilog remainder currently, because
+; the outer loop does not contain the epilog preheader and epilog exit (while
+; in fact it should). This causes us to choke on the LCSSA form being incorrect in
+; the outer loop. However, the exit block where LCSSA fails is in fact still within
+; the outer loop. For now, we just bail out when an outer loop is present and an
+; epilog loop would be generated.
+; The outer loop header is the preheader for the inner loop and the inner header
+; branches back to the outer loop.
+define void @test8() {
+; EPILOG: test8(
+; EPILOG-NOT: niter
+
+; PROLOG: test8(
+; PROLOG: outerloop:
+; PROLOG-NEXT: phi i64 [ 3, %bb ], [ 0, %outerloop.loopexit ]
+; PROLOG: %lcmp.mod = icmp eq i64
+; PROLOG-NEXT: br i1 %lcmp.mod, label %innerH.prol.loopexit, label %innerH.prol.preheader
+; PROLOG: latch.6:
+; PROLOG-NEXT: %tmp4.7 = add nsw i64 %tmp3, 8
+; PROLOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.7
+; PROLOG: latch.7
+; PROLOG-NEXT: %tmp6.7 = icmp ult i64 %tmp4.7, 100
+; PROLOG-NEXT: br i1 %tmp6.7, label %innerH, label %exit.unr-lcssa
+bb:
+ br label %outerloop
+
+outerloop: ; preds = %innerH, %bb
+ %tmp = phi i64 [ 3, %bb ], [ 0, %innerH ]
+ br label %innerH
+
+innerH: ; preds = %latch, %outerloop
+ %tmp3 = phi i64 [ %tmp4, %latch ], [ %tmp, %outerloop ]
+ %tmp4 = add nuw nsw i64 %tmp3, 1
+ br i1 false, label %outerloop, label %latch
+
+latch: ; preds = %innerH
+ %tmp6 = icmp ult i64 %tmp4, 100
+ br i1 %tmp6, label %innerH, label %exit
+
+exit: ; preds = %latch
+ ret void
+}
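Both remainder styles split a runtime trip count against the unroll factor of 8 as trip = 8*k + r with r = trip & 7 (the XTRAITER computation in the checks above): epilog mode runs the 8x-unrolled body k times and then the r leftover iterations afterwards, while prolog mode peels the r iterations off the front first. For example, trip = 13 gives one unrolled pass plus a 5-iteration remainder in either mode.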
diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index 04661314eb1d7..878f4e8c78f0f 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -170,6 +170,74 @@ for.end: ; preds = %for.cond.for.end_cr
ret i16 %res.0.lcssa
}
+; don't unroll a loop with multiple exit/exiting blocks, unless
+; -runtime-unroll-multi-exit=true
+; single exit, multiple exiting blocks.
+define void @unique_exit(i32 %arg) {
+; PROLOG: unique_exit(
+; PROLOG-NOT: .unr
+
+; EPILOG: unique_exit(
+; EPILOG-NOT: .unr
+entry:
+ %tmp = icmp sgt i32 undef, %arg
+ br i1 %tmp, label %preheader, label %returnblock
+
+preheader: ; preds = %entry
+ br label %header
+
+LoopExit: ; preds = %header, %latch
+ %tmp2.ph = phi i32 [ %tmp4, %header ], [ -1, %latch ]
+ br label %returnblock
+
+returnblock: ; preds = %LoopExit, %entry
+ %tmp2 = phi i32 [ -1, %entry ], [ %tmp2.ph, %LoopExit ]
+ ret void
+
+header: ; preds = %preheader, %latch
+ %tmp4 = phi i32 [ %inc, %latch ], [ %arg, %preheader ]
+ %inc = add nsw i32 %tmp4, 1
+ br i1 true, label %LoopExit, label %latch
+
+latch: ; preds = %header
+ %cmp = icmp slt i32 %inc, undef
+ br i1 %cmp, label %header, label %LoopExit
+}
+
+; multiple exit blocks. don't unroll
+define void @multi_exit(i64 %trip, i1 %cond) {
+; PROLOG: multi_exit(
+; PROLOG-NOT: .unr
+
+; EPILOG: multi_exit(
+; EPILOG-NOT: .unr
+entry:
+ br label %loop_header
+
+loop_header:
+ %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ]
+ br i1 %cond, label %loop_latch, label %loop_exiting_bb1
+
+loop_exiting_bb1:
+ br i1 false, label %loop_exiting_bb2, label %exit1
+
+loop_exiting_bb2:
+ br i1 false, label %loop_latch, label %exit3
+
+exit3:
+ ret void
+
+loop_latch:
+ %iv_next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv_next, %trip
+ br i1 %cmp, label %loop_header, label %exit2.loopexit
+
+exit1:
+ ret void
+
+exit2.loopexit:
+ ret void
+}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.unroll.runtime.disable"}
diff --git a/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
new file mode 100644
index 0000000000000..cd3e89ae73504
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; REQUIRES: asserts
+; This test should not be vectorized on the X86 SLM arch.
+; Vectorizing the 64-bit multiply in this case is wrong since
+; it can be done with a lower bit width (note that the sources are 16-bit).
+; Also, addq/subq (quad word) have a high cost on the SLM arch.
+; This test shows bad performance (a regression of -70%) if vectorized on the SLM arch.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1.
+ %cmp17 = icmp sgt i32 %LastIndex, 0
+ br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv5 = sext i16 %Scale to i64
+ %sh_prom = and i64 %conv5, 4294967295
+ %0 = sext i16 %lag to i64
+ %wide.trip.count = zext i32 %LastIndex to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %conv8 = trunc i64 %add7 to i32
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+ ret i32 %Accumulator.0.lcssa
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+ %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx, align 2
+ %conv = sext i16 %1 to i64
+ %2 = add nsw i64 %indvars.iv, %0
+ %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+ %3 = load i16, i16* %arrayidx3, align 2
+ %conv4 = sext i16 %3 to i64
+ %mul = mul nsw i64 %conv4, %conv
+ %shr = ashr i64 %mul, %sh_prom
+ %add7 = add i64 %shr, %Accumulator.018
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 3a581ebf847ec..7f381ae6ad7b5 100644
--- a/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -1,18 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-;CHECK-LABEL: @foo(
-;CHECK: icmp sgt
-;CHECK: icmp sgt
-;CHECK: icmp slt
-;CHECK: select <4 x i1>
-;CHECK: %[[P1:.*]] = select <4 x i1>
-;CHECK: xor <4 x i1>
-;CHECK: and <4 x i1>
-;CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %[[P1]]
-;CHECK: ret
define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]]
+; CHECK: min.iters.checked:
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0
+; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP17:%.*]] = and <4 x i1> [[TMP11]], [[TMP16]]
+; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP18]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP20]], 19
+; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP21]], 4
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
+; CHECK-NEXT: br label [[IF_END14]]
+; CHECK: if.end14:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ]
+; CHECK-NEXT: store i32 [[X_0]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !8
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef
+;
entry:
%cmp26 = icmp sgt i32 %n, 0
br i1 %cmp26, label %for.body, label %for.end
@@ -46,3 +120,4 @@ if.end14:
for.end:
ret i32 undef
}
+
diff --git a/test/Transforms/LoopVectorize/pr33706.ll b/test/Transforms/LoopVectorize/pr33706.ll
new file mode 100644
index 0000000000000..b9d0d8a44accb
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr33706.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s
+
+@global = local_unnamed_addr global i32 0, align 4
+@global.1 = local_unnamed_addr global i32 0, align 4
+@global.2 = local_unnamed_addr global float 0x3EF0000000000000, align 4
+
+; CHECK-LABEL: @PR33706
+; CHECK-NOT: <2 x i32>
+define void @PR33706(float* nocapture readonly %arg, float* nocapture %arg1, i32 %arg2) local_unnamed_addr {
+bb:
+ %tmp = load i32, i32* @global.1, align 4
+ %tmp3 = getelementptr inbounds float, float* %arg, i64 190
+ %tmp4 = getelementptr inbounds float, float* %arg1, i64 512
+ %tmp5 = and i32 %tmp, 65535
+ %tmp6 = icmp ugt i32 %arg2, 65536
+ br i1 %tmp6, label %bb7, label %bb9
+
+bb7: ; preds = %bb
+ %tmp8 = load i32, i32* @global, align 4
+ br label %bb27
+
+bb9: ; preds = %bb
+ %tmp10 = udiv i32 65536, %arg2
+ br label %bb11
+
+bb11: ; preds = %bb11, %bb9
+ %tmp12 = phi i32 [ %tmp20, %bb11 ], [ %tmp5, %bb9 ]
+ %tmp13 = phi float* [ %tmp18, %bb11 ], [ %tmp4, %bb9 ]
+ %tmp14 = phi i32 [ %tmp16, %bb11 ], [ %tmp10, %bb9 ]
+ %tmp15 = phi i32 [ %tmp19, %bb11 ], [ %tmp, %bb9 ]
+ %tmp16 = add nsw i32 %tmp14, -1
+ %tmp17 = sitofp i32 %tmp12 to float
+ store float %tmp17, float* %tmp13, align 4
+ %tmp18 = getelementptr inbounds float, float* %tmp13, i64 1
+ %tmp19 = add i32 %tmp15, %arg2
+ %tmp20 = and i32 %tmp19, 65535
+ %tmp21 = icmp eq i32 %tmp16, 0
+ br i1 %tmp21, label %bb22, label %bb11
+
+bb22: ; preds = %bb11
+ %tmp23 = phi float* [ %tmp18, %bb11 ]
+ %tmp24 = phi i32 [ %tmp19, %bb11 ]
+ %tmp25 = phi i32 [ %tmp20, %bb11 ]
+ %tmp26 = ashr i32 %tmp24, 16
+ store i32 %tmp26, i32* @global, align 4
+ br label %bb27
+
+bb27: ; preds = %bb22, %bb7
+ %tmp28 = phi i32 [ %tmp26, %bb22 ], [ %tmp8, %bb7 ]
+ %tmp29 = phi float* [ %tmp23, %bb22 ], [ %tmp4, %bb7 ]
+ %tmp30 = phi i32 [ %tmp25, %bb22 ], [ %tmp5, %bb7 ]
+ %tmp31 = sext i32 %tmp28 to i64
+ %tmp32 = getelementptr inbounds float, float* %tmp3, i64 %tmp31
+ %tmp33 = load float, float* %tmp32, align 4
+ %tmp34 = sitofp i32 %tmp30 to float
+ %tmp35 = load float, float* @global.2, align 4
+ %tmp36 = fmul float %tmp35, %tmp34
+ %tmp37 = fadd float %tmp33, %tmp36
+ store float %tmp37, float* %tmp29, align 4
+ ret void
+}
diff --git a/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml
index 17b634acd0e1a..558aa9aa73f25 100644
--- a/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml
+++ b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml
@@ -16,4 +16,5 @@ CfiFunctionDefs:
CfiFunctionDecls:
- external
- external_weak
+ - local_decl
...
diff --git a/test/Transforms/LowerTypeTests/import-icall.ll b/test/Transforms/LowerTypeTests/import-icall.ll
index ddeb7fb5c9a2b..b4e374720321e 100644
--- a/test/Transforms/LowerTypeTests/import-icall.ll
+++ b/test/Transforms/LowerTypeTests/import-icall.ll
@@ -19,6 +19,10 @@ define i8 @use_b() {
ret i8 %x
}
+define void @local_decl() {
+ call void @local_decl()
+ ret void
+}
declare void @external()
declare extern_weak void @external_weak()
@@ -33,6 +37,9 @@ declare extern_weak void @external_weak()
; CHECK: define internal i8 @local_b() {
; CHECK-NEXT: call i8 @local_a()
+; CHECK: define void @local_decl()
+; CHECK-NEXT: call void @local_decl()
+
; CHECK: declare void @external()
; CHECK: declare extern_weak void @external_weak()
; CHECK: declare i8 @local_a()
diff --git a/test/Transforms/NewGVN/pr33720.ll b/test/Transforms/NewGVN/pr33720.ll
new file mode 100644
index 0000000000000..3b6c190a44944
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33720.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -newgvn -S %s | FileCheck %s
+
+@f = external local_unnamed_addr global i64
+@b = external local_unnamed_addr global i64
+@e = external local_unnamed_addr global i64
+
+define void @patatino() {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_END24:%.*]], label [[FOR_COND16:%.*]]
+; CHECK: for.cond2thread-pre-split:
+; CHECK-NEXT: br i1 false, label [[FOR_BODY:%.*]], label [[FOR_COND8_PREHEADER:%.*]]
+; CHECK: for.cond8.preheader:
+; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label %for.cond11thread-pre-split.lr.ph
+; CHECK: for.cond11thread-pre-split.lr.ph:
+; CHECK-NEXT: br label [[L1]]
+; CHECK: for.body:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[K_2:%.*]], 3
+; CHECK-NEXT: [[CONV4:%.*]] = zext i1 [[CMP3]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @f
+; CHECK-NEXT: [[OR:%.*]] = or i64 [[TMP0]], [[CONV4]]
+; CHECK-NEXT: store i64 [[OR]], i64* @f
+; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp ne i64 [[K_2]], 0
+; CHECK-NEXT: br i1 [[TOBOOL7]], label %for.cond2thread-pre-split, label [[LOR_RHS:%.*]]
+; CHECK: lor.rhs:
+; CHECK-NEXT: store i64 1, i64* @b, align 8
+; CHECK-NEXT: br label %for.cond2thread-pre-split
+; CHECK: l1:
+; CHECK-NEXT: [[K_2]] = phi i64 [ undef, [[L1_PREHEADER:%.*]] ], [ 15, [[FOR_COND8_PREHEADER]] ], [ 5, %for.cond11thread-pre-split.lr.ph ]
+; CHECK-NEXT: store i64 7, i64* [[J_3:%.*]]
+; CHECK-NEXT: br label [[FOR_BODY]]
+; CHECK: for.cond16:
+; CHECK-NEXT: [[J_0:%.*]] = phi i64* [ @f, [[ENTRY:%.*]] ], [ undef, [[FOR_COND20:%.*]] ], [ @e, [[FOR_COND16]] ]
+; CHECK-NEXT: br i1 undef, label [[FOR_COND20]], label [[FOR_COND16]]
+; CHECK: for.cond20:
+; CHECK-NEXT: [[J_2:%.*]] = phi i64* [ [[J_0]], [[FOR_COND16]] ], [ undef, [[IF_END24]] ]
+; CHECK-NEXT: br i1 true, label [[IF_END24]], label [[FOR_COND16]]
+; CHECK: if.end24:
+; CHECK-NEXT: [[J_3]] = phi i64* [ [[J_2]], [[FOR_COND20]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: br i1 false, label [[FOR_COND20]], label [[L1_PREHEADER]]
+; CHECK: l1.preheader:
+; CHECK-NEXT: br label [[L1]]
+;
+entry:
+ br i1 undef, label %if.end24, label %for.cond16
+
+for.cond2thread-pre-split:
+ br i1 false, label %for.body, label %for.cond8.preheader
+
+for.cond8.preheader:
+ br i1 undef, label %l1, label %for.cond11thread-pre-split.lr.ph
+
+for.cond11thread-pre-split.lr.ph:
+ br label %l1
+
+for.body:
+ %k.031 = phi i64 [ %k.2, %l1 ], [ 15, %for.cond2thread-pre-split ]
+ %cmp3 = icmp ne i64 %k.031, 3
+ %conv4 = zext i1 %cmp3 to i64
+ %0 = load i64, i64* @f
+ %or = or i64 %0, %conv4
+ store i64 %or, i64* @f
+ %tobool7 = icmp ne i64 %k.031, 0
+ %or.cond = or i1 %tobool7, false
+ br i1 %or.cond, label %for.cond2thread-pre-split, label %lor.rhs
+
+lor.rhs:
+ store i64 1, i64* @b, align 8
+ br label %for.cond2thread-pre-split
+
+l1:
+ %k.2 = phi i64 [ undef, %l1.preheader ], [ 15, %for.cond8.preheader ], [ 5, %for.cond11thread-pre-split.lr.ph ]
+ store i64 7, i64* %j.3
+ br label %for.body
+
+for.cond16:
+ %j.0 = phi i64* [ @f, %entry ], [ %j.2, %for.cond20 ], [ @e, %for.cond16 ]
+ br i1 undef, label %for.cond20, label %for.cond16
+
+for.cond20:
+ %j.2 = phi i64* [ %j.0, %for.cond16 ], [ %j.3, %if.end24 ]
+ br i1 true, label %if.end24, label %for.cond16
+
+if.end24:
+ %j.3 = phi i64* [ %j.2, %for.cond20 ], [ undef, %entry ]
+ br i1 false, label %for.cond20, label %l1.preheader
+
+l1.preheader:
+ br label %l1
+}
diff --git a/test/Transforms/PGOProfile/counter_promo_exit_merge.ll b/test/Transforms/PGOProfile/counter_promo_exit_merge.ll
index f53d37600ce61..85ca1613c8ad3 100644
--- a/test/Transforms/PGOProfile/counter_promo_exit_merge.ll
+++ b/test/Transforms/PGOProfile/counter_promo_exit_merge.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
-; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
+; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
+; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
$__llvm_profile_raw_version = comdat any
diff --git a/test/Transforms/PGOProfile/counter_promo_mexits.ll b/test/Transforms/PGOProfile/counter_promo_mexits.ll
index 71e5f066d50f3..bb799757a47cc 100644
--- a/test/Transforms/PGOProfile/counter_promo_mexits.ll
+++ b/test/Transforms/PGOProfile/counter_promo_mexits.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
-; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
+; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
@g = common local_unnamed_addr global i32 0, align 4
diff --git a/test/Transforms/PGOProfile/counter_promo_nest.ll b/test/Transforms/PGOProfile/counter_promo_nest.ll
new file mode 100644
index 0000000000000..b7f117b3e9496
--- /dev/null
+++ b/test/Transforms/PGOProfile/counter_promo_nest.ll
@@ -0,0 +1,165 @@
+; Test that counter updates are promoted outside the whole loop nest.
+; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
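+;
+; Illustrative sketch of the transform (an assumed shape, not taken from the
+; PROMO lines below): inside the nest each counter update
+;   %c = load i64, i64* @__profc_main
+;   %c.next = add i64 %c, 1
+;   store i64 %c.next, i64* @__profc_main
+; becomes an increment of a promoted value, with a single load hoisted above
+; the outermost loop and a single store sunk to the nest exit; the PROMO
+; checks in %bb8 verify those merged updates at the exit.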
+
+@g = common local_unnamed_addr global i32 0, align 4
+@c = local_unnamed_addr global i32 10, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define void @bar() local_unnamed_addr #0 {
+bb:
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+bb:
+ store i32 0, i32* @g, align 4, !tbaa !2
+ %tmp = load i32, i32* @c, align 4, !tbaa !2
+ %tmp1 = icmp sgt i32 %tmp, 0
+ br i1 %tmp1, label %bb2_1, label %bb84
+
+bb2_1:
+ br label %bb2
+
+bb2: ; preds = %bb39, %bb
+ %tmp3 = phi i32 [ %tmp40, %bb39 ], [ %tmp, %bb2_1 ]
+ %tmp5 = phi i32 [ %tmp43, %bb39 ], [ 0, %bb2_1 ]
+ %tmp7 = icmp sgt i32 %tmp3, 0
+ br i1 %tmp7, label %bb14_1, label %bb39
+
+bb8: ; preds = %bb39
+; PROMO-LABEL: bb8
+; PROMO: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+
+ %tmp13 = icmp sgt i32 %tmp40, 0
+ br i1 %tmp13, label %bb45, label %bb84
+
+bb14_1:
+ br label %bb14
+
+bb14: ; preds = %bb29, %bb2
+ %tmp15 = phi i32 [ %tmp30, %bb29 ], [ %tmp3, %bb14_1 ]
+ %tmp16 = phi i64 [ %tmp31, %bb29 ], [ 0, %bb14_1 ]
+ %tmp17 = phi i64 [ %tmp32, %bb29 ], [ 0, %bb14_1 ]
+ %tmp18 = phi i32 [ %tmp33, %bb29 ], [ 0, %bb14_1 ]
+ %tmp19 = icmp sgt i32 %tmp15, 0
+ br i1 %tmp19, label %bb20_split, label %bb29
+
+bb20_split:
+ br label %bb20
+
+bb20: ; preds = %bb20, %bb14
+ %tmp21 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb20_split ]
+ %tmp22 = phi i32 [ %tmp24, %bb20 ], [ 0, %bb20_split ]
+ %tmp23 = add nuw i64 %tmp21, 1
+ tail call void @bar()
+ %tmp24 = add nuw nsw i32 %tmp22, 1
+ %tmp25 = load i32, i32* @c, align 4, !tbaa !2
+ %tmp26 = icmp slt i32 %tmp24, %tmp25
+ br i1 %tmp26, label %bb20, label %bb27
+
+bb27: ; preds = %bb20
+ %tmp28 = add i64 %tmp23, %tmp16
+ br label %bb29
+
+bb29: ; preds = %bb27, %bb14
+ %tmp30 = phi i32 [ %tmp25, %bb27 ], [ %tmp15, %bb14 ]
+ %tmp31 = phi i64 [ %tmp28, %bb27 ], [ %tmp16, %bb14 ]
+ %tmp32 = add nuw i64 %tmp17, 1
+ %tmp33 = add nuw nsw i32 %tmp18, 1
+ %tmp34 = icmp slt i32 %tmp33, %tmp30
+ br i1 %tmp34, label %bb14, label %bb35
+
+bb35: ; preds = %bb29
+ %tmp36 = insertelement <2 x i64> undef, i64 %tmp31, i32 0
+ br label %bb39
+
+bb39: ; preds = %bb35, %bb2
+ %tmp40 = phi i32 [ %tmp30, %bb35 ], [ %tmp3, %bb2 ]
+ %tmp43 = add nuw nsw i32 %tmp5, 1
+ %tmp44 = icmp slt i32 %tmp43, %tmp40
+ br i1 %tmp44, label %bb2, label %bb8
+
+bb45: ; preds = %bb67, %bb8
+ %tmp46 = phi i32 [ %tmp68, %bb67 ], [ %tmp40, %bb8 ]
+ %tmp47 = phi i64 [ %tmp69, %bb67 ], [ 0, %bb8 ]
+ %tmp48 = phi i64 [ %tmp70, %bb67 ], [ 0, %bb8 ]
+ %tmp49 = phi i32 [ %tmp71, %bb67 ], [ 0, %bb8 ]
+ %tmp50 = icmp sgt i32 %tmp46, 0
+ br i1 %tmp50, label %bb57, label %bb67
+
+bb51: ; preds = %bb67
+ %tmp56 = icmp sgt i32 %tmp68, 0
+ br i1 %tmp56, label %bb73, label %bb84
+
+bb57: ; preds = %bb57, %bb45
+ %tmp58 = phi i64 [ %tmp60, %bb57 ], [ 0, %bb45 ]
+ %tmp59 = phi i32 [ %tmp61, %bb57 ], [ 0, %bb45 ]
+ %tmp60 = add nuw i64 %tmp58, 1
+ tail call void @bar()
+ %tmp61 = add nuw nsw i32 %tmp59, 1
+ %tmp62 = load i32, i32* @c, align 4, !tbaa !2
+ %tmp63 = mul nsw i32 %tmp62, 10
+ %tmp64 = icmp slt i32 %tmp61, %tmp63
+ br i1 %tmp64, label %bb57, label %bb65
+
+bb65: ; preds = %bb57
+ %tmp66 = add i64 %tmp60, %tmp47
+ br label %bb67
+
+bb67: ; preds = %bb65, %bb45
+ %tmp68 = phi i32 [ %tmp62, %bb65 ], [ %tmp46, %bb45 ]
+ %tmp69 = phi i64 [ %tmp66, %bb65 ], [ %tmp47, %bb45 ]
+ %tmp70 = add nuw i64 %tmp48, 1
+ %tmp71 = add nuw nsw i32 %tmp49, 1
+ %tmp72 = icmp slt i32 %tmp71, %tmp68
+ br i1 %tmp72, label %bb45, label %bb51
+
+bb73: ; preds = %bb73, %bb51
+ %tmp74 = phi i64 [ %tmp76, %bb73 ], [ 0, %bb51 ]
+ %tmp75 = phi i32 [ %tmp77, %bb73 ], [ 0, %bb51 ]
+ %tmp76 = add nuw i64 %tmp74, 1
+ tail call void @bar()
+ %tmp77 = add nuw nsw i32 %tmp75, 1
+ %tmp78 = load i32, i32* @c, align 4, !tbaa !2
+ %tmp79 = mul nsw i32 %tmp78, 100
+ %tmp80 = icmp slt i32 %tmp77, %tmp79
+ br i1 %tmp80, label %bb73, label %bb81
+
+bb81: ; preds = %bb73
+ br label %bb84
+
+bb84: ; preds = %bb81, %bb51, %bb8, %bb
+ ret i32 0
+}
+
+attributes #0 = { noinline }
+attributes #1 = { norecurse nounwind uwtable }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 307355)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/SimplifyCFG/implied-and-or.ll b/test/Transforms/SimplifyCFG/implied-and-or.ll
new file mode 100644
index 0000000000000..e615f302feefd
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/implied-and-or.ll
@@ -0,0 +1,183 @@
+; RUN: opt %s -S -simplifycfg | FileCheck %s
+
+declare void @foo()
+declare void @bar()
+
+
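+; When the branch on 'and i1 %cmp1, %cmp2' is taken, both operands are known
+; true on that edge, so a repeated compare there is implied and can be folded.
+; Minimal sketch of the idea (assumed, not pass output):
+;   %and = and i1 %cmp1, %cmp2
+;   br i1 %and, label %taken, label %end
+; taken:                       ; %cmp1 is known true here
+;   %cmp3 = icmp eq i32 %a, 0  ; same predicate as %cmp1 -> folds to true
+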
+; CHECK-LABEL: @test_and1
+; CHECK: taken:
+; CHECK-NOT: cmp3
+; CHECK: call void @bar()
+; CHECK-NEXT: call void @foo()
+; CHECK: ret
+define void @test_and1(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp2 = icmp eq i32 %b, 0
+ %and = and i1 %cmp1, %cmp2
+ br i1 %and, label %taken, label %end
+
+taken:
+ call void @bar()
+ %cmp3 = icmp eq i32 %a, 0 ;; <-- implied true
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
+
+; We can't infer anything if the result of the 'and' is false.
+; CHECK-LABEL: @test_and2
+; CHECK: taken:
+; CHECK: call void @bar()
+; CHECK: %cmp3
+; CHECK: br i1 %cmp3
+; CHECK: if.then:
+; CHECK: call void @foo()
+; CHECK: ret
+define void @test_and2(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp2 = icmp eq i32 %b, 0
+ %and = and i1 %cmp1, %cmp2
+ br i1 %and, label %end, label %taken
+
+taken:
+ call void @bar()
+ %cmp3 = icmp eq i32 %a, 0
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
+
+; CHECK-LABEL: @test_or1
+; CHECK: taken:
+; CHECK-NOT: cmp3
+; CHECK: call void @bar()
+; CHECK-NEXT: call void @foo()
+; CHECK: ret
+define void @test_or1(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp2 = icmp eq i32 %b, 0
+ %or = or i1 %cmp1, %cmp2
+ br i1 %or, label %end, label %taken
+
+taken:
+ call void @bar()
+ %cmp3 = icmp ne i32 %a, 0 ;; <-- implied true
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
+
+; We can't infer anything if the result of the 'or' is true.
+; CHECK-LABEL: @test_or2
+; CHECK: call void @bar()
+; CHECK: %cmp3
+; CHECK: br i1 %cmp3
+; CHECK: if.then:
+; CHECK: call void @foo()
+; CHECK: ret
+define void @test_or2(i32 %a, i32 %b) {
+entry:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp2 = icmp eq i32 %b, 0
+ %or = or i1 %cmp1, %cmp2
+ br i1 %or, label %taken, label %end
+
+taken:
+ call void @bar()
+ %cmp3 = icmp eq i32 %a, 0
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
+
+; We can recurse through a tree of 'and's and 'or's.
+; CHECK-LABEL: @test_and_recurse1
+; CHECK: taken:
+; CHECK-NEXT: call void @bar()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: br label %end
+; CHECK: ret
+define void @test_and_recurse1(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cmpa = icmp eq i32 %a, 0
+ %cmpb = icmp eq i32 %b, 0
+ %cmpc = icmp eq i32 %c, 0
+ %and1 = and i1 %cmpa, %cmpb
+ %and2 = and i1 %and1, %cmpc
+ br i1 %and2, label %taken, label %end
+
+taken:
+ call void @bar()
+ %cmp3 = icmp eq i32 %a, 0
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
+
+; Check to make sure we don't recurse too deeply.
+; CHECK-LABEL: @test_and_recurse2
+; CHECK: taken:
+; CHECK-NEXT: call void @bar()
+; CHECK-NEXT: %cmp3 = icmp eq i32 %a, 0
+; CHECK-NEXT: br i1 %cmp3, label %if.then, label %end
+; CHECK: ret
+define void @test_and_recurse2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
+ i32 %g, i32 %h) {
+entry:
+ %cmpa = icmp eq i32 %a, 0
+ %cmpb = icmp eq i32 %b, 0
+ %cmpc = icmp eq i32 %c, 0
+ %cmpd = icmp eq i32 %d, 0
+ %cmpe = icmp eq i32 %e, 0
+ %cmpf = icmp eq i32 %f, 0
+ %cmpg = icmp eq i32 %g, 0
+ %cmph = icmp eq i32 %h, 0
+ %and1 = and i1 %cmpa, %cmpb
+ %and2 = and i1 %and1, %cmpc
+ %and3 = and i1 %and2, %cmpd
+ %and4 = and i1 %and3, %cmpe
+ %and5 = and i1 %and4, %cmpf
+ %and6 = and i1 %and5, %cmpg
+ %and7 = and i1 %and6, %cmph
+ br i1 %and7, label %taken, label %end
+
+taken:
+ call void @bar()
+  %cmp3 = icmp eq i32 %a, 0 ; <-- implied true, but the 'and' chain is too deep to analyze
+ br i1 %cmp3, label %if.then, label %end
+
+if.then:
+ call void @foo()
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
index 0f7bfa8516c96..513da477607b9 100644
--- a/test/Transforms/SimplifyCFG/sink-common-code.ll
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -818,6 +818,30 @@ merge:
; CHECK: right:
; CHECK-NEXT: %val1 = call i32 @call_target() [ "deopt"(i32 20) ]
+%T = type {i32, i32}
+
+define i32 @test_insertvalue(i1 zeroext %flag, %T %P) {
+entry:
+ br i1 %flag, label %if.then, label %if.else
+
+if.then:
+ %t1 = insertvalue %T %P, i32 0, 0
+ br label %if.end
+
+if.else:
+ %t2 = insertvalue %T %P, i32 1, 0
+ br label %if.end
+
+if.end:
+ %t = phi %T [%t1, %if.then], [%t2, %if.else]
+ ret i32 1
+}
+
+; CHECK-LABEL: @test_insertvalue
+; CHECK: select
+; CHECK: insertvalue
+; CHECK-NOT: insertvalue
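+;
+; Expected shape after sinking (a sketch inferred from the CHECK lines above,
+; not verbatim pass output): the differing constant is chosen by a select and
+; a single insertvalue remains, e.g.
+;   %sel = select i1 %flag, i32 0, i32 1
+;   %t = insertvalue %T %P, i32 %sel, 0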
+
; CHECK: ![[TBAA]] = !{![[TYPE:[0-9]]], ![[TYPE]], i64 0}
; CHECK: ![[TYPE]] = !{!"float", ![[TEXT:[0-9]]]}
; CHECK: ![[TEXT]] = !{!"an example type tree"}
diff --git a/test/Transforms/Sink/fence.ll b/test/Transforms/Sink/fence.ll
index aa237d8192b63..09aa565d88f80 100644
--- a/test/Transforms/Sink/fence.ll
+++ b/test/Transforms/Sink/fence.ll
@@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test1(i32* ()*) {
entry:
%1 = call i32* %0() #0
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
%2 = load i32, i32* %1, align 4
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
%3 = icmp eq i32 %2, 0
br i1 %3, label %fail, label %pass
@@ -20,9 +20,9 @@ pass: ; preds = %fail, %top
; CHECK-LABEL: @test1(
; CHECK: %[[call:.*]] = call i32* %0()
-; CHECK: fence singlethread seq_cst
+; CHECK: fence syncscope("singlethread") seq_cst
; CHECK: load i32, i32* %[[call]], align 4
-; CHECK: fence singlethread seq_cst
+; CHECK: fence syncscope("singlethread") seq_cst
attributes #0 = { nounwind readnone }
diff --git a/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll b/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll
new file mode 100644
index 0000000000000..661d0739401a7
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll
@@ -0,0 +1,37 @@
+; Test for a bug specific to the new pass manager where we may build a domtree
+; to make more precise AA queries for functions.
+;
+; RUN: opt -aa-pipeline=default -passes='no-op-module' -debug-pass-manager -thinlto-bc -o %t %s
+; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s
+; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s
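+;
+; -thinlto-bc splits this module in two: the M0 checks cover the half that
+; keeps the ordinary definitions, and the M1 checks cover the split half that
+; receives the !type-annotated vtable @global.1 and what it references (an
+; assumed description of the split, inferred from the checks below).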
+
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.hoge = type { %struct.widget }
+%struct.widget = type { i32 (...)** }
+
+; M0: @global = local_unnamed_addr global
+; M1-NOT: @global
+@global = local_unnamed_addr global %struct.hoge { %struct.widget { i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @global.1, i32 0, inrange i32 0, i32 2) to i32 (...)**) } }, align 8
+
+; M0: @global.1 = external unnamed_addr constant
+; M1: @global.1 = linkonce_odr unnamed_addr constant
+@global.1 = linkonce_odr unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @global.4 to i8*), i8* bitcast (i32 (%struct.widget*)* @quux to i8*)] }, align 8, !type !0
+
+; M0: @global.2 = external global
+; M1-NOT: @global.2
+@global.2 = external global i8*
+
+; M0: @global.3 = linkonce_odr constant
+; M1-NOT: @global.3
+@global.3 = linkonce_odr constant [22 x i8] c"zzzzzzzzzzzzzzzzzzzzz\00"
+
+; M0: @global.4 = linkonce_odr constant
+; M1: @global.4 = external constant
+@global.4 = linkonce_odr constant { i8*, i8* }{ i8* bitcast (i8** getelementptr inbounds (i8*, i8** @global.2, i64 2) to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @global.3, i32 0, i32 0) }
+
+@llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer
+
+declare i32 @quux(%struct.widget*) unnamed_addr
+
+!0 = !{i64 16, !"yyyyyyyyyyyyyyyyyyyyyyyyy"}
diff --git a/test/Unit/lit.cfg b/test/Unit/lit.cfg
index dac0bf829ba6f..9da82f5f2c9bd 100644
--- a/test/Unit/lit.cfg
+++ b/test/Unit/lit.cfg
@@ -3,6 +3,7 @@
# Configuration file for the 'lit' test runner.
import os
+import subprocess
import lit.formats
@@ -75,8 +76,8 @@ if config.test_exec_root is None:
lit_config.fatal('No site specific configuration available!')
# Get the source and object roots.
- llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
- llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip()
+ llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip()
+ llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip()
# Validate that we got a tree which points to here.
this_src_root = os.path.join(os.path.dirname(__file__),'..','..')
diff --git a/test/Verifier/2004-05-21-SwitchConstantMismatch.ll b/test/Verifier/2004-05-21-SwitchConstantMismatch.ll
index 339a21cac1907..fea290d74c4ad 100644
--- a/test/Verifier/2004-05-21-SwitchConstantMismatch.ll
+++ b/test/Verifier/2004-05-21-SwitchConstantMismatch.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
diff --git a/test/Verifier/2007-12-21-InvokeParamAttrs.ll b/test/Verifier/2007-12-21-InvokeParamAttrs.ll
index 709b47b33daa1..c62bc0f4e190f 100644
--- a/test/Verifier/2007-12-21-InvokeParamAttrs.ll
+++ b/test/Verifier/2007-12-21-InvokeParamAttrs.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @foo(i8*)
diff --git a/test/Verifier/2008-01-11-VarargAttrs.ll b/test/Verifier/2008-01-11-VarargAttrs.ll
index af97ce6474492..d3eb7c72699a3 100644
--- a/test/Verifier/2008-01-11-VarargAttrs.ll
+++ b/test/Verifier/2008-01-11-VarargAttrs.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%struct = type { }
diff --git a/test/Verifier/2009-05-29-InvokeResult1.ll b/test/Verifier/2009-05-29-InvokeResult1.ll
index bb815b3bfe159..38679f4c49fc1 100644
--- a/test/Verifier/2009-05-29-InvokeResult1.ll
+++ b/test/Verifier/2009-05-29-InvokeResult1.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare i32 @v()
diff --git a/test/Verifier/2009-05-29-InvokeResult2.ll b/test/Verifier/2009-05-29-InvokeResult2.ll
index 900b1d827bf45..92a51d71efe65 100644
--- a/test/Verifier/2009-05-29-InvokeResult2.ll
+++ b/test/Verifier/2009-05-29-InvokeResult2.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare i32 @v()
diff --git a/test/Verifier/2009-05-29-InvokeResult3.ll b/test/Verifier/2009-05-29-InvokeResult3.ll
index 050de4669d350..3fff219cab7dc 100644
--- a/test/Verifier/2009-05-29-InvokeResult3.ll
+++ b/test/Verifier/2009-05-29-InvokeResult3.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare i32 @v()
diff --git a/test/Verifier/byval-1.ll b/test/Verifier/byval-1.ll
index 9bbead0861146..9d09a0ffb1176 100644
--- a/test/Verifier/byval-1.ll
+++ b/test/Verifier/byval-1.ll
@@ -1,2 +1,2 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @h(i32 byval %num)
diff --git a/test/Verifier/element-wise-atomic-memory-intrinsics.ll b/test/Verifier/element-wise-atomic-memory-intrinsics.ll
index 470c861c50573..81c8ba16b97d1 100644
--- a/test/Verifier/element-wise-atomic-memory-intrinsics.ll
+++ b/test/Verifier/element-wise-atomic-memory-intrinsics.ll
@@ -22,4 +22,46 @@ define void @test_memcpy(i8* %P, i8* %Q, i32 %A, i32 %E) {
ret void
}
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+
+define void @test_memmove(i8* %P, i8* %Q, i32 %A, i32 %E) {
+ ; CHECK: element size of the element-wise unordered atomic memory intrinsic must be a constant int
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 %E)
+ ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 3)
+
+ ; CHECK: constant length must be a multiple of the element size in the element-wise atomic memory intrinsic
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 7, i32 4)
+
+ ; CHECK: incorrect alignment of the destination argument
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* %P, i8* align 4 %Q, i32 1, i32 1)
+ ; CHECK: incorrect alignment of the destination argument
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %P, i8* align 4 %Q, i32 4, i32 4)
+
+ ; CHECK: incorrect alignment of the source argument
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* %Q, i32 1, i32 1)
+ ; CHECK: incorrect alignment of the source argument
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 1 %Q, i32 4, i32 4)
+
+ ret void
+}
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+
+define void @test_memset(i8* %P, i8 %V, i32 %A, i32 %E) {
+ ; CHECK: element size of the element-wise unordered atomic memory intrinsic must be a constant int
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 %E)
+ ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 3)
+
+ ; CHECK: constant length must be a multiple of the element size in the element-wise atomic memory intrinsic
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 7, i32 4)
+
+ ; CHECK: incorrect alignment of the destination argument
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* %P, i8 %V, i32 1, i32 1)
+ ; CHECK: incorrect alignment of the destination argument
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %P, i8 %V, i32 4, i32 4)
+
+ ret void
+}
+declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture, i8, i32, i32) nounwind
+
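+; For contrast, a call the verifier accepts (an assumed example, not exercised
+; above) uses a constant power-of-two element size, a length that is a
+; multiple of it, and sufficiently aligned pointers:
+;   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 8, i32 4)
+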
; CHECK: input module is broken!
diff --git a/test/Verifier/gcread-ptrptr.ll b/test/Verifier/gcread-ptrptr.ll
index 4ed22fa6c24ec..f8b21bfb4c935 100644
--- a/test/Verifier/gcread-ptrptr.ll
+++ b/test/Verifier/gcread-ptrptr.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
; PR1633
%meta = type { i8* }
diff --git a/test/Verifier/gcroot-alloca.ll b/test/Verifier/gcroot-alloca.ll
index 8caa4b9f58b56..775bde78250e2 100644
--- a/test/Verifier/gcroot-alloca.ll
+++ b/test/Verifier/gcroot-alloca.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
; PR1633
%meta = type { i8* }
diff --git a/test/Verifier/gcroot-meta.ll b/test/Verifier/gcroot-meta.ll
index 1836f61c7ad6d..26f7b5156294f 100644
--- a/test/Verifier/gcroot-meta.ll
+++ b/test/Verifier/gcroot-meta.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
; PR1633
%meta = type { i8* }
diff --git a/test/Verifier/gcroot-ptrptr.ll b/test/Verifier/gcroot-ptrptr.ll
index b573295e3e94f..8d7557d75a491 100644
--- a/test/Verifier/gcroot-ptrptr.ll
+++ b/test/Verifier/gcroot-ptrptr.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
; PR1633
%meta = type { i8* }
diff --git a/test/Verifier/gcwrite-ptrptr.ll b/test/Verifier/gcwrite-ptrptr.ll
index 1f60becc33271..dec1e6bcd3345 100644
--- a/test/Verifier/gcwrite-ptrptr.ll
+++ b/test/Verifier/gcwrite-ptrptr.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
; PR1633
%meta = type { i8* }
diff --git a/test/lit.cfg b/test/lit.cfg
index ed1ba2d11b1a9..8ed9187aea77c 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -6,6 +6,7 @@ import os
import sys
import re
import platform
+import subprocess
import lit.util
import lit.formats
@@ -150,8 +151,8 @@ if config.test_exec_root is None:
lit_config.fatal('No site specific configuration available!')
# Get the source and object roots.
- llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
- llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip()
+ llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip()
+ llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip()
# Validate that we got a tree which points to here.
this_src_root = os.path.dirname(config.test_source_root)
diff --git a/test/tools/llvm-cov/threads.c b/test/tools/llvm-cov/threads.c
new file mode 100644
index 0000000000000..00a85edb7ce8b
--- /dev/null
+++ b/test/tools/llvm-cov/threads.c
@@ -0,0 +1,11 @@
+// Coverage/profile data recycled from the showLineExecutionCounts.cpp test.
+//
+// RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -j 1 -o %t1.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -num-threads 2 -o %t2.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -o %t3.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp
+//
+// RUN: diff %t1.dir/index.txt %t2.dir/index.txt
+// RUN: diff %t1.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %t2.dir/coverage/tmp/showLineExecutionCounts.cpp.txt
+// RUN: diff %t1.dir/index.txt %t3.dir/index.txt
+// RUN: diff %t1.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %t3.dir/coverage/tmp/showLineExecutionCounts.cpp.txt
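+//
+// The three show invocations differ only in thread count (-j 1,
+// -num-threads 2, and the tool's default), so the diffs above assert that
+// threading leaves the emitted reports byte-identical (a reading of the
+// RUN lines, not additional checks).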
diff --git a/test/tools/llvm-cov/zeroFunctionFile.c b/test/tools/llvm-cov/zeroFunctionFile.c
index 87b6ecd3abb3e..d5b983efb8179 100644
--- a/test/tools/llvm-cov/zeroFunctionFile.c
+++ b/test/tools/llvm-cov/zeroFunctionFile.c
@@ -13,7 +13,7 @@ int main() {
// REPORT: 0 0 - 0 0 - 0 0 - 0 0 -
// REPORT-NO: 0%
-// RUN: llvm-cov show %S/Inputs/zeroFunctionFile.covmapping -format html -instr-profile %t.profdata -o %t.dir
+// RUN: llvm-cov show -j 1 %S/Inputs/zeroFunctionFile.covmapping -format html -instr-profile %t.profdata -o %t.dir
// RUN: FileCheck %s -input-file=%t.dir/index.html -check-prefix=HTML
// HTML: <td class='column-entry-green'><pre>- (0/0)
// HTML-NO: 0.00% (0/0)
diff --git a/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm b/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm
new file mode 100644
index 0000000000000..79d19962e00b0
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm
Binary files differ
diff --git a/test/tools/llvm-objdump/ARM/macho-reloc-half.test b/test/tools/llvm-objdump/ARM/macho-reloc-half.test
new file mode 100644
index 0000000000000..888c7f5891168
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-reloc-half.test
@@ -0,0 +1,4 @@
+RUN: llvm-objdump -r %p/Inputs/reloc-half.obj.macho-arm | FileCheck %s
+
+CHECK-DAG: 00000004 ARM_RELOC_HALF :upper16:(_stringbuf)
+CHECK-DAG: 00000000 ARM_RELOC_HALF :lower16:(_stringbuf)
diff --git a/test/tools/llvm-objdump/Inputs/test.wasm b/test/tools/llvm-objdump/Inputs/test.wasm
deleted file mode 100644
index d3906eeaf6f86..0000000000000
--- a/test/tools/llvm-objdump/Inputs/test.wasm
+++ /dev/null
Binary files differ
diff --git a/test/tools/llvm-objdump/Inputs/trivial.ll b/test/tools/llvm-objdump/Inputs/trivial.ll
new file mode 100644
index 0000000000000..6dd510a12b66b
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/trivial.ll
@@ -0,0 +1,19 @@
+; Input used for generating checked-in binaries (trivial.obj.*)
+; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial.obj.wasm
+
+@.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %call = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0)) nounwind
+ tail call void bitcast (void (...)* @SomeOtherFunction to void ()*)() nounwind
+ ret i32 0
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+declare void @SomeOtherFunction(...)
+
+@var = global i32 0
+@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @var to i8*)], section "llvm.metadata"
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* null, i8* null }]
diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.wasm b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm
new file mode 100644
index 0000000000000..1f3947ac472e0
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm
Binary files differ
diff --git a/test/tools/llvm-objdump/WebAssembly/symbol-table.test b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
index 8936c7a12e4c6..2c49d5d65c5d7 100644
--- a/test/tools/llvm-objdump/WebAssembly/symbol-table.test
+++ b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
@@ -1,8 +1,11 @@
-RUN: llvm-objdump -t %p/../Inputs/test.wasm | FileCheck %s
+RUN: llvm-objdump -t %p/../Inputs/trivial.obj.wasm | FileCheck %s
+
+CHECK: SYMBOL TABLE:
+CHECK-NEXT: 00000000 l F IMPORT puts
+CHECK-NEXT: 00000000 l F IMPORT SomeOtherFunction
+CHECK-NEXT: 00000002 g F EXPORT main
+CHECK-NEXT: 00000001 g EXPORT var
+CHECK-NEXT: 00000000 l F name puts
+CHECK-NEXT: 00000001 l F name SomeOtherFunction
+CHECK-NEXT: 00000002 l F name main
-CHECK: SYMBOL TABLE:
-CHECK: 00000000 l F IMPORT bar
-CHECK: 00000000 g F EXPORT baz
-CHECK: 00000001 g F EXPORT quux
-CHECK: 00000000 l F name $import
-CHECK: 00000001 l F name $func0
diff --git a/test/tools/llvm-objdump/wasm.txt b/test/tools/llvm-objdump/wasm.txt
index 4aa40c6c9df8c..828fa34b2b46f 100644
--- a/test/tools/llvm-objdump/wasm.txt
+++ b/test/tools/llvm-objdump/wasm.txt
@@ -1,24 +1,27 @@
-# RUN: llvm-objdump -h %p/Inputs/test.wasm | FileCheck %s
+# RUN: llvm-objdump -h %p/Inputs/trivial.obj.wasm | FileCheck %s
-# CHECK: Sections:
-# CHECK: Idx Name Size Address Type
-# CHECK: 0 TYPE 0000000f 0000000000000000
-# CHECK: 1 IMPORT 0000000b 0000000000000000
-# CHECK: 2 FUNCTION 00000003 0000000000000000
-# CHECK: 3 TABLE 00000005 0000000000000000
-# CHECK: 4 EXPORT 0000000e 0000000000000000
-# CHECK: 5 ELEM 00000007 0000000000000000
-# CHECK: 6 CODE 0000002a 0000000000000000 TEXT
-# CHECK: 7 name 0000003c 0000000000000000
+# CHECK: Sections:
+# CHECK-NEXT: Idx Name Size Address Type
+# CHECK-NEXT: 0 TYPE 0000000e 0000000000000000
+# CHECK-NEXT: 1 IMPORT 00000024 0000000000000000
+# CHECK-NEXT: 2 FUNCTION 00000002 0000000000000000
+# CHECK-NEXT: 3 TABLE 00000004 0000000000000000
+# CHECK-NEXT: 4 MEMORY 00000003 0000000000000000
+# CHECK-NEXT: 5 GLOBAL 0000000b 0000000000000000
+# CHECK-NEXT: 6 EXPORT 0000000e 0000000000000000
+# CHECK-NEXT: 7 CODE 00000019 0000000000000000 TEXT
+# CHECK-NEXT: 8 DATA 0000001a 0000000000000000 DATA
+# CHECK-NEXT: 9 name 0000002b 0000000000000000
+# CHECK-NEXT: 10 reloc.CODE 00000017 0000000000000000
+# CHECK-NEXT: 11 linking 00000016 0000000000000000
-# RUN: llvm-objdump -p %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-HEADER
+# RUN: llvm-objdump -p %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-HEADER
# CHECK-HEADER: Program Header:
# CHECK-HEADER: Version: 0x1
-# RUN: llvm-objdump -s --section=CODE %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-SECTIONS
+# RUN: llvm-objdump -s --section=CODE %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-SECTIONS
# CHECK-SECTIONS: Contents of section CODE:
-# CHECK-SECTIONS: 0000 02070043 0000803f 0b200201 7d017c10 ...C...?. ..}.|.
-# CHECK-SECTIONS: 0010 001a4100 10011a41 00410111 00001a20 ..A....A.A.....
-# CHECK-SECTIONS: 0020 011a4300 00000021 020b ..C....!..
+# CHECK-SECTIONS: 0000 01170041 80808080 00108080 8080001a ...A............
+# CHECK-SECTIONS: 0010 10818080 80004100 0b ......A..
diff --git a/test/tools/llvm-pdbdump/partial-type-stream.test b/test/tools/llvm-pdbdump/partial-type-stream.test
index 3a853c3914506..7c62acce7ad4b 100644
--- a/test/tools/llvm-pdbdump/partial-type-stream.test
+++ b/test/tools/llvm-pdbdump/partial-type-stream.test
@@ -17,8 +17,7 @@ DEPS: Types (TPI Stream)
DEPS-NEXT: ============================================================
DEPS-NEXT: Showing 1 records and their dependents (4 records total)
DEPS-NEXT: 0x100E | LF_ARGLIST [size = 8]
-DEPS-NEXT: 0x1017 | LF_CLASS [size = 60]
-DEPS-NEXT: class name: `MembersTest::A`
+DEPS-NEXT: 0x1017 | LF_CLASS [size = 60] `MembersTest::A`
DEPS-NEXT: unique name: `.?AVA@MembersTest@@`
DEPS-NEXT: vtable: <no type>, base list: <no type>, field list: <no type>
DEPS-NEXT: options: forward ref | has unique name
diff --git a/test/tools/llvm-profdata/c-general.test b/test/tools/llvm-profdata/c-general.test
index 0ec7c113eb4c6..ddb95d1260d88 100644
--- a/test/tools/llvm-profdata/c-general.test
+++ b/test/tools/llvm-profdata/c-general.test
@@ -10,6 +10,7 @@ REGENERATE: $ clang -o a.out -fprofile-instr-generate $CFE_TESTDIR/c-general.c
REGENERATE: $ LLVM_PROFILE_FILE=$TESTDIR/Inputs/c-general.profraw ./a.out
RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - | FileCheck %s
+RUN: llvm-profdata show %p/Inputs/c-general.profraw --topn=3 -o - | FileCheck %s --check-prefix=TOPN
RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - --function=switches | FileCheck %s -check-prefix=SWITCHES -check-prefix=CHECK
SWITCHES-LABEL: Counters:
@@ -22,3 +23,6 @@ SWITCHES-LABEL: Functions shown: 1
CHECK-LABEL: Total functions: 12
CHECK-NEXT: Maximum function count: 1
CHECK-NEXT: Maximum internal block count: 100
+TOPN: boolean_operators, max count = 100
+TOPN-NEXT: simple_loops, max count = 100
+TOPN-NEXT: conditionals, max count = 100
diff --git a/test/tools/llvm-readobj/Inputs/trivial.ll b/test/tools/llvm-readobj/Inputs/trivial.ll
index f79b8b897691c..e0e519d064dea 100644
--- a/test/tools/llvm-readobj/Inputs/trivial.ll
+++ b/test/tools/llvm-readobj/Inputs/trivial.ll
@@ -1,9 +1,11 @@
-; llc -mtriple=i386-pc-win32 trivial.ll -filetype=obj -o trivial-object-test.coff-i386
-; llc -mtriple=x86_64-pc-win32 trivial.ll -filetype=obj -o trivial-object-test.coff-x86-64
-; llc -mtriple=i386-linux-gnu trivial.ll -filetype=obj -o trivial-object-test.elf-i386 -relocation-model=pic
-; llc -mtriple=x86_64-linux-gnu trivial.ll -filetype=obj -o trivial-object-test.elf-x86-64 -relocation-model=pic
-; llc -mtriple=i386-apple-darwin10 trivial.ll -filetype=obj -o trivial-object-test.macho-i386 -relocation-model=pic
-; llc -mtriple=x86_64-apple-darwin10 trivial.ll -filetype=obj -o trivial-object-test.macho-x86-64 -relocation-model=pic
+; Input used for generating checked-in binaries (trivial.obj.*)
+; llc -mtriple=i386-pc-win32 trivial.ll -filetype=obj -o trivial.obj.coff-i386
+; llc -mtriple=x86_64-pc-win32 trivial.ll -filetype=obj -o trivial.obj.coff-x86-64
+; llc -mtriple=i386-linux-gnu trivial.ll -filetype=obj -o trivial.obj.elf-i386 -relocation-model=pic
+; llc -mtriple=x86_64-linux-gnu trivial.ll -filetype=obj -o trivial.obj.elf-x86-64 -relocation-model=pic
+; llc -mtriple=i386-apple-darwin10 trivial.ll -filetype=obj -o trivial.obj.macho-i386 -relocation-model=pic
+; llc -mtriple=x86_64-apple-darwin10 trivial.ll -filetype=obj -o trivial.obj.macho-x86-64 -relocation-model=pic
+; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial.obj.wasm
@.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1
diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
index f14192f1798b0..caa702f700153 100644
--- a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
+++ b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
Binary files differ
diff --git a/test/tools/llvm-readobj/codeview-linetables.test b/test/tools/llvm-readobj/codeview-linetables.test
index fe68e7efdb056..9256aefe4330b 100644
--- a/test/tools/llvm-readobj/codeview-linetables.test
+++ b/test/tools/llvm-readobj/codeview-linetables.test
@@ -41,7 +41,7 @@ MFUN32: ]
MFUN32: Subsection [
MFUN32-NEXT: SubSectionType: Symbols (0xF1)
MFUN32-NEXT: SubSectionSize: 0x4B
-MFUN32: ProcStart {
+MFUN32: GlobalProcIdSym {
MFUN32: CodeSize: 0xA
MFUN32: DisplayName: x
MFUN32: LinkageName: _x
@@ -60,7 +60,7 @@ MFUN32: ]
MFUN32: Subsection [
MFUN32-NEXT: SubSectionType: Symbols (0xF1)
MFUN32-NEXT: SubSectionSize: 0x4B
-MFUN32: ProcStart {
+MFUN32: GlobalProcIdSym {
MFUN32: CodeSize: 0xA
MFUN32: DisplayName: y
MFUN32: LinkageName: _y
@@ -79,7 +79,7 @@ MFUN32: ]
MFUN32: Subsection [
MFUN32-NEXT: SubSectionType: Symbols (0xF1)
MFUN32-NEXT: SubSectionSize: 0x4B
-MFUN32: ProcStart {
+MFUN32: GlobalProcIdSym {
MFUN32: CodeSize: 0x14
MFUN32: DisplayName: f
MFUN32: LinkageName: _f
@@ -193,7 +193,7 @@ MFUN64: ]
MFUN64: Subsection [
MFUN64-NEXT: SubSectionType: Symbols (0xF1)
MFUN64-NEXT: SubSectionSize: 0x4B
-MFUN64: ProcStart {
+MFUN64: GlobalProcIdSym {
MFUN64: CodeSize: 0xE
MFUN64: DisplayName: x
MFUN64: LinkageName: x
@@ -208,7 +208,7 @@ MFUN64-NEXT: ]
MFUN64-NEXT: Subsection [
MFUN64-NEXT: SubSectionType: Symbols (0xF1)
MFUN64-NEXT: SubSectionSize: 0x4B
-MFUN64: ProcStart {
+MFUN64: GlobalProcIdSym {
MFUN64: CodeSize: 0xE
MFUN64: DisplayName: y
MFUN64: LinkageName: y
@@ -223,7 +223,7 @@ MFUN64-NEXT: ]
MFUN64-NEXT: Subsection [
MFUN64-NEXT: SubSectionType: Symbols (0xF1)
MFUN64-NEXT: SubSectionSize: 0x4B
-MFUN64: ProcStart {
+MFUN64: GlobalProcIdSym {
MFUN64: CodeSize: 0x18
MFUN64: DisplayName: f
MFUN64: LinkageName: f
@@ -365,7 +365,7 @@ MFILE32: ]
MFILE32: Subsection [
MFILE32-NEXT: SubSectionType: Symbols (0xF1)
MFILE32-NEXT: SubSectionSize: 0x4B
-MFILE32: ProcStart {
+MFILE32: GlobalProcIdSym {
MFILE32: CodeSize: 0x14
MFILE32: DisplayName: f
MFILE32: LinkageName: _f
@@ -442,7 +442,7 @@ MFILE64: ]
MFILE64: Subsection [
MFILE64-NEXT: SubSectionType: Symbols (0xF1)
MFILE64-NEXT: SubSectionSize: 0x4B
-MFILE64: ProcStart {
+MFILE64: GlobalProcIdSym {
MFILE64: CodeSize: 0x18
MFILE64: DisplayName: f
MFILE64: LinkageName: f
@@ -528,7 +528,7 @@ RUN: | FileCheck %s -check-prefix MCOMDAT
RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/comdat-function-linetables.obj.coff-2013-i386 \
RUN: | FileCheck %s -check-prefix MCOMDAT
-MCOMDAT: ProcStart {
+MCOMDAT: GlobalProcIdSym {
MCOMDAT: CodeSize: 0x7
MCOMDAT: DisplayName: f
MCOMDAT: LinkageName: ?f@@YAHXZ
@@ -556,7 +556,7 @@ MCOMDAT-NEXT: IsStatement: Yes
MCOMDAT-NEXT: ]
MCOMDAT-NEXT: ]
MCOMDAT-NEXT: ]
-MCOMDAT: ProcStart {
+MCOMDAT: GlobalProcIdSym {
MCOMDAT: CodeSize: 0x7
MCOMDAT: DisplayName: g
MCOMDAT: LinkageName: ?g@@YAHXZ
diff --git a/test/tools/llvm-readobj/file-headers.test b/test/tools/llvm-readobj/file-headers.test
index 6bc9714f2037e..65ccd50a27294 100644
--- a/test/tools/llvm-readobj/file-headers.test
+++ b/test/tools/llvm-readobj/file-headers.test
@@ -28,9 +28,6 @@ RUN: llvm-readobj -h %p/Inputs/magic.coff-importlib \
RUN: | FileCheck %s -check-prefix COFF-IMPORTLIB
RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-lanai \
RUN: | FileCheck %s -check-prefix ELF-LANAI
-# trivial.obj.wasm was generated using the following command:
-# echo "extern int bar, baz; int foo() { return bar + baz + (int)&foo; }" | \
-# ./bin/clang -c -o trivial.obj.wasm -target wasm32-unknown-unknown-wasm -x c -
RUN: llvm-readobj -h %p/Inputs/trivial.obj.wasm \
RUN: | FileCheck %s -check-prefix WASM
diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index 9c7dcf1d659c0..85ccd3cefa1b9 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test
@@ -289,21 +289,20 @@ MACHO-ARM-NEXT: ]
WASM: Relocations [
WASM-NEXT: Section (8) CODE {
WASM-NEXT: Relocation {
-WASM-NEXT: Type: R_WEBASSEMBLY_TABLE_INDEX_SLEB (1)
-WASM-NEXT: Offset: 0x6
+WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_SLEB (4)
+WASM-NEXT: Offset: 0x4
WASM-NEXT: Index: 0x0
+WASM-NEXT: Addend: 0
WASM-NEXT: }
WASM-NEXT: Relocation {
-WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_LEB (3)
-WASM-NEXT: Offset: 0x15
+WASM-NEXT: Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
+WASM-NEXT: Offset: 0xA
WASM-NEXT: Index: 0x0
-WASM-NEXT: Addend: 0
WASM-NEXT: }
WASM-NEXT: Relocation {
-WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_LEB (3)
-WASM-NEXT: Offset: 0x24
+WASM-NEXT: Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
+WASM-NEXT: Offset: 0x11
WASM-NEXT: Index: 0x1
-WASM-NEXT: Addend: 0
WASM-NEXT: }
WASM-NEXT: }
WASM-NEXT: ]
diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index 1747ee45d4f3c..4eda5dae882ac 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test
@@ -493,62 +493,75 @@ MACHO-ARM-NEXT: Reserved2: 0x0
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT:]
-WASM: Sections [
-WASM-NEXT: Section {
-WASM-NEXT: Type: TYPE (0x1)
-WASM-NEXT: Size: 5
-WASM-NEXT: Offset: 8
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: IMPORT (0x2)
-WASM-NEXT: Size: 23
-WASM-NEXT: Offset: 19
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: FUNCTION (0x3)
-WASM-NEXT: Size: 2
-WASM-NEXT: Offset: 48
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: TABLE (0x4)
-WASM-NEXT: Size: 4
-WASM-NEXT: Offset: 56
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: MEMORY (0x5)
-WASM-NEXT: Size: 3
-WASM-NEXT: Offset: 66
-WASM-NEXT: Memories [
-WASM-NEXT: Memory {
-WASM-NEXT: InitialPages: 0
-WASM-NEXT: }
-WASM-NEXT: ]
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: EXPORT (0x7)
-WASM-NEXT: Size: 7
-WASM-NEXT: Offset: 75
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: ELEM (0x9)
-WASM-NEXT: Size: 7
-WASM-NEXT: Offset: 88
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: CODE (0xA)
-WASM-NEXT: Size: 61
-WASM-NEXT: Offset: 101
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: CUSTOM (0x0)
-WASM-NEXT: Size: 17
-WASM-NEXT: Offset: 168
-WASM-NEXT: Name: name
-WASM-NEXT: }
-WASM-NEXT: Section {
-WASM-NEXT: Type: CUSTOM (0x0)
-WASM-NEXT: Size: 24
-WASM-NEXT: Offset: 191
-WASM-NEXT: Name: reloc.CODE
-WASM-NEXT: }
-WASM-NEXT:]
+WASM: Sections [
+WASM-NEXT: Section {
+WASM-NEXT: Type: TYPE (0x1)
+WASM-NEXT: Size: 14
+WASM-NEXT: Offset: 8
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: IMPORT (0x2)
+WASM-NEXT: Size: 36
+WASM-NEXT: Offset: 28
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: FUNCTION (0x3)
+WASM-NEXT: Size: 2
+WASM-NEXT: Offset: 70
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: TABLE (0x4)
+WASM-NEXT: Size: 4
+WASM-NEXT: Offset: 78
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: MEMORY (0x5)
+WASM-NEXT: Size: 3
+WASM-NEXT: Offset: 88
+WASM-NEXT: Memories [
+WASM-NEXT: Memory {
+WASM-NEXT: InitialPages: 1
+WASM-NEXT: }
+WASM-NEXT: ]
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: GLOBAL (0x6)
+WASM-NEXT: Size: 6
+WASM-NEXT: Offset: 97
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: EXPORT (0x7)
+WASM-NEXT: Size: 8
+WASM-NEXT: Offset: 109
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: CODE (0xA)
+WASM-NEXT: Size: 25
+WASM-NEXT: Offset: 123
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: DATA (0xB)
+WASM-NEXT: Size: 19
+WASM-NEXT: Offset: 154
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: CUSTOM (0x0)
+WASM-NEXT: Size: 43
+WASM-NEXT: Offset: 179
+WASM-NEXT: Name: name
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: CUSTOM (0x0)
+WASM-NEXT: Size: 23
+WASM-NEXT: Offset: 228
+WASM-NEXT: Name: reloc.CODE
+WASM-NEXT: }
+WASM-NEXT: Section {
+WASM-NEXT: Type: CUSTOM (0x0)
+WASM-NEXT: Size: 22
+WASM-NEXT: Offset: 257
+WASM-NEXT: Name: linking
+WASM-NEXT: DataSize: 13
+WASM-NEXT: DataAlignment: 1
+WASM-NEXT: }
+WASM-NEXT: ]
diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test
index da8a70b031aba..380c6f6a5ee50 100644
--- a/test/tools/llvm-readobj/symbols.test
+++ b/test/tools/llvm-readobj/symbols.test
@@ -73,22 +73,32 @@ ELF-NEXT: }
WASM: Symbols [
WASM-NEXT: Symbol {
-WASM-NEXT: Name: bar
-WASM-NEXT: Type: GLOBAL_IMPORT (0x2)
+WASM-NEXT: Name: puts
+WASM-NEXT: Type: FUNCTION_IMPORT (0x0)
WASM-NEXT: Flags: 0x0
WASM-NEXT: }
WASM-NEXT: Symbol {
-WASM-NEXT: Name: baz
-WASM-NEXT: Type: GLOBAL_IMPORT (0x2)
+WASM-NEXT: Name: SomeOtherFunction
+WASM-NEXT: Type: FUNCTION_IMPORT (0x0)
WASM-NEXT: Flags: 0x0
WASM-NEXT: }
WASM-NEXT: Symbol {
-WASM-NEXT: Name: foo
+WASM-NEXT: Name: main
WASM-NEXT: Type: FUNCTION_EXPORT (0x1)
WASM-NEXT: Flags: 0x0
WASM-NEXT: }
WASM-NEXT: Symbol {
-WASM-NEXT: Name: foo
+WASM-NEXT: Name: puts
+WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4)
+WASM-NEXT: Flags: 0x0
+WASM-NEXT: }
+WASM-NEXT: Symbol {
+WASM-NEXT: Name: SomeOtherFunction
+WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4)
+WASM-NEXT: Flags: 0x0
+WASM-NEXT: }
+WASM-NEXT: Symbol {
+WASM-NEXT: Name: main
WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4)
WASM-NEXT: Flags: 0x0
WASM-NEXT: }